Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'bpf-introduce-deferred-task-context-execution'

Mykyta Yatsenko says:

====================
bpf: Introduce deferred task context execution

From: Mykyta Yatsenko <yatsenko@meta.com>

This patch introduces a new mechanism for BPF programs to schedule
deferred execution in the context of a specific task using the kernel’s
task_work infrastructure.

The new bpf_task_work interface enables BPF use cases that
require sleepable subprogram execution within task context, for example,
scheduling a sleepable function from a context that does not
allow sleeping, such as NMI.

The new kfuncs bpf_task_work_schedule_signal() and
bpf_task_work_schedule_resume() schedule BPF callbacks; they correspond
to the two notification modes used by task_work (TWA_SIGNAL and TWA_RESUME).

The implementation manages scheduling state via metadata objects (struct
bpf_task_work_ctx). Pointers to bpf_task_work_ctx are stored
in BPF map values. State transitions are handled via an atomic
state machine (bpf_task_work_state) to ensure correctness under
concurrent usage and deletion; lifetime is guarded by refcounting and
RCU Tasks Trace.
The kfuncs call task_work_add() indirectly via irq_work to avoid taking
locks in what may be NMI context.

Changelog:
---
v7 -> v8
v7: https://lore.kernel.org/bpf/20250922232611.614512-1-mykyta.yatsenko5@gmail.com/
* Fix unused variable warning in patch 1
* Decrease stress test time from 2 to 1 second
* Went through CI warnings: other than the unused variable, there are just
2 new ones in kernel/bpf/helpers.c related to the newly introduced kfuncs;
these look expected.

v6 -> v7
v6: https://lore.kernel.org/bpf/20250918132615.193388-1-mykyta.yatsenko5@gmail.com/
* Added stress test
* Extending refactoring in patch 1
* Changing comment and removing one check for map->usercnt in patch 7

v5 -> v6
v5: https://lore.kernel.org/bpf/20250916233651.258458-1-mykyta.yatsenko5@gmail.com/
* Fixing readability in verifier.c:check_map_field_pointer()
* Removing BUG_ON from helpers.c

v4 -> v5
v4:
https://lore.kernel.org/all/20250915201820.248977-1-mykyta.yatsenko5@gmail.com/
* Fix invalid/null pointer dereference bug, reported by syzbot
* Nits in selftests

v3 -> v4
v3: https://lore.kernel.org/all/20250905164508.1489482-1-mykyta.yatsenko5@gmail.com/
* Modify async callback return value processing in verifier, to allow
non-zero return values.
* Change return type of the callback from void to int, as verifier
expects scalar value.
* Switched to void* for bpf_map API kfunc arguments to avoid casts.
* Addressing numerous nits and small improvements.

v2 -> v3
v2: https://lore.kernel.org/all/20250815192156.272445-1-mykyta.yatsenko5@gmail.com/
* Introduce ref counting
* Add patches with minor verifier and btf.c refactorings to avoid code
duplication
* Rework initiation of the task work scheduling to handle race with map
usercnt dropping to zero
====================

Link: https://patch.msgid.link/20250923112404.668720-1-mykyta.yatsenko5@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

+1146 -110
+11
include/linux/bpf.h
··· 209 209 BPF_WORKQUEUE = (1 << 10), 210 210 BPF_UPTR = (1 << 11), 211 211 BPF_RES_SPIN_LOCK = (1 << 12), 212 + BPF_TASK_WORK = (1 << 13), 212 213 }; 213 214 214 215 enum bpf_cgroup_storage_type { ··· 263 262 int timer_off; 264 263 int wq_off; 265 264 int refcount_off; 265 + int task_work_off; 266 266 struct btf_field fields[]; 267 267 }; 268 268 ··· 365 363 return "bpf_rb_node"; 366 364 case BPF_REFCOUNT: 367 365 return "bpf_refcount"; 366 + case BPF_TASK_WORK: 367 + return "bpf_task_work"; 368 368 default: 369 369 WARN_ON_ONCE(1); 370 370 return "unknown"; ··· 405 401 return sizeof(struct bpf_rb_node); 406 402 case BPF_REFCOUNT: 407 403 return sizeof(struct bpf_refcount); 404 + case BPF_TASK_WORK: 405 + return sizeof(struct bpf_task_work); 408 406 default: 409 407 WARN_ON_ONCE(1); 410 408 return 0; ··· 439 433 return __alignof__(struct bpf_rb_node); 440 434 case BPF_REFCOUNT: 441 435 return __alignof__(struct bpf_refcount); 436 + case BPF_TASK_WORK: 437 + return __alignof__(struct bpf_task_work); 442 438 default: 443 439 WARN_ON_ONCE(1); 444 440 return 0; ··· 472 464 case BPF_KPTR_REF: 473 465 case BPF_KPTR_PERCPU: 474 466 case BPF_UPTR: 467 + case BPF_TASK_WORK: 475 468 break; 476 469 default: 477 470 WARN_ON_ONCE(1); ··· 609 600 bool lock_src); 610 601 void bpf_timer_cancel_and_free(void *timer); 611 602 void bpf_wq_cancel_and_free(void *timer); 603 + void bpf_task_work_cancel_and_free(void *timer); 612 604 void bpf_list_head_free(const struct btf_field *field, void *list_head, 613 605 struct bpf_spin_lock *spin_lock); 614 606 void bpf_rb_root_free(const struct btf_field *field, void *rb_root, ··· 2436 2426 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); 2437 2427 void bpf_obj_free_timer(const struct btf_record *rec, void *obj); 2438 2428 void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); 2429 + void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); 2439 2430 void 
bpf_obj_free_fields(const struct btf_record *rec, void *obj); 2440 2431 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); 2441 2432
+4
include/uapi/linux/bpf.h
··· 7436 7436 __u64 __opaque[2]; 7437 7437 } __attribute__((aligned(8))); 7438 7438 7439 + struct bpf_task_work { 7440 + __u64 __opaque; 7441 + } __attribute__((aligned(8))); 7442 + 7439 7443 struct bpf_wq { 7440 7444 __u64 __opaque[2]; 7441 7445 } __attribute__((aligned(8)));
+5 -3
kernel/bpf/arraymap.c
··· 443 443 return (void *)round_down((unsigned long)array, PAGE_SIZE); 444 444 } 445 445 446 - static void array_map_free_timers_wq(struct bpf_map *map) 446 + static void array_map_free_internal_structs(struct bpf_map *map) 447 447 { 448 448 struct bpf_array *array = container_of(map, struct bpf_array, map); 449 449 int i; ··· 451 451 /* We don't reset or free fields other than timer and workqueue 452 452 * on uref dropping to zero. 453 453 */ 454 - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { 454 + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { 455 455 for (i = 0; i < array->map.max_entries; i++) { 456 456 if (btf_record_has_field(map->record, BPF_TIMER)) 457 457 bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); 458 458 if (btf_record_has_field(map->record, BPF_WORKQUEUE)) 459 459 bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); 460 + if (btf_record_has_field(map->record, BPF_TASK_WORK)) 461 + bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); 460 462 } 461 463 } 462 464 } ··· 797 795 .map_alloc = array_map_alloc, 798 796 .map_free = array_map_free, 799 797 .map_get_next_key = array_map_get_next_key, 800 - .map_release_uref = array_map_free_timers_wq, 798 + .map_release_uref = array_map_free_internal_structs, 801 799 .map_lookup_elem = array_map_lookup_elem, 802 800 .map_update_elem = array_map_update_elem, 803 801 .map_delete_elem = array_map_delete_elem,
+38 -49
kernel/bpf/btf.c
··· 3478 3478 return BTF_FIELD_FOUND; 3479 3479 } 3480 3480 3481 - #define field_mask_test_name(field_type, field_type_str) \ 3482 - if (field_mask & field_type && !strcmp(name, field_type_str)) { \ 3483 - type = field_type; \ 3484 - goto end; \ 3485 - } 3486 - 3487 3481 static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type, 3488 - u32 field_mask, u32 *seen_mask, 3489 - int *align, int *sz) 3482 + u32 field_mask, u32 *seen_mask, int *align, int *sz) 3490 3483 { 3491 - int type = 0; 3484 + const struct { 3485 + enum btf_field_type type; 3486 + const char *const name; 3487 + const bool is_unique; 3488 + } field_types[] = { 3489 + { BPF_SPIN_LOCK, "bpf_spin_lock", true }, 3490 + { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true }, 3491 + { BPF_TIMER, "bpf_timer", true }, 3492 + { BPF_WORKQUEUE, "bpf_wq", true }, 3493 + { BPF_TASK_WORK, "bpf_task_work", true }, 3494 + { BPF_LIST_HEAD, "bpf_list_head", false }, 3495 + { BPF_LIST_NODE, "bpf_list_node", false }, 3496 + { BPF_RB_ROOT, "bpf_rb_root", false }, 3497 + { BPF_RB_NODE, "bpf_rb_node", false }, 3498 + { BPF_REFCOUNT, "bpf_refcount", false }, 3499 + }; 3500 + int type = 0, i; 3492 3501 const char *name = __btf_name_by_offset(btf, var_type->name_off); 3502 + const char *field_type_name; 3503 + enum btf_field_type field_type; 3504 + bool is_unique; 3493 3505 3494 - if (field_mask & BPF_SPIN_LOCK) { 3495 - if (!strcmp(name, "bpf_spin_lock")) { 3496 - if (*seen_mask & BPF_SPIN_LOCK) 3506 + for (i = 0; i < ARRAY_SIZE(field_types); ++i) { 3507 + field_type = field_types[i].type; 3508 + field_type_name = field_types[i].name; 3509 + is_unique = field_types[i].is_unique; 3510 + if (!(field_mask & field_type) || strcmp(name, field_type_name)) 3511 + continue; 3512 + if (is_unique) { 3513 + if (*seen_mask & field_type) 3497 3514 return -E2BIG; 3498 - *seen_mask |= BPF_SPIN_LOCK; 3499 - type = BPF_SPIN_LOCK; 3500 - goto end; 3515 + *seen_mask |= field_type; 3501 3516 } 3517 + type = field_type; 3518 
+ goto end; 3502 3519 } 3503 - if (field_mask & BPF_RES_SPIN_LOCK) { 3504 - if (!strcmp(name, "bpf_res_spin_lock")) { 3505 - if (*seen_mask & BPF_RES_SPIN_LOCK) 3506 - return -E2BIG; 3507 - *seen_mask |= BPF_RES_SPIN_LOCK; 3508 - type = BPF_RES_SPIN_LOCK; 3509 - goto end; 3510 - } 3511 - } 3512 - if (field_mask & BPF_TIMER) { 3513 - if (!strcmp(name, "bpf_timer")) { 3514 - if (*seen_mask & BPF_TIMER) 3515 - return -E2BIG; 3516 - *seen_mask |= BPF_TIMER; 3517 - type = BPF_TIMER; 3518 - goto end; 3519 - } 3520 - } 3521 - if (field_mask & BPF_WORKQUEUE) { 3522 - if (!strcmp(name, "bpf_wq")) { 3523 - if (*seen_mask & BPF_WORKQUEUE) 3524 - return -E2BIG; 3525 - *seen_mask |= BPF_WORKQUEUE; 3526 - type = BPF_WORKQUEUE; 3527 - goto end; 3528 - } 3529 - } 3530 - field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); 3531 - field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); 3532 - field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); 3533 - field_mask_test_name(BPF_RB_NODE, "bpf_rb_node"); 3534 - field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); 3535 3520 3536 3521 /* Only return BPF_KPTR when all other types with matchable names fail */ 3537 3522 if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { ··· 3529 3544 *align = btf_field_type_align(type); 3530 3545 return type; 3531 3546 } 3532 - 3533 - #undef field_mask_test_name 3534 3547 3535 3548 /* Repeat a number of fields for a specified number of times. 3536 3549 * ··· 3676 3693 case BPF_LIST_NODE: 3677 3694 case BPF_RB_NODE: 3678 3695 case BPF_REFCOUNT: 3696 + case BPF_TASK_WORK: 3679 3697 ret = btf_find_struct(btf, var_type, off, sz, field_type, 3680 3698 info_cnt ? 
&info[0] : &tmp); 3681 3699 if (ret < 0) ··· 3969 3985 rec->timer_off = -EINVAL; 3970 3986 rec->wq_off = -EINVAL; 3971 3987 rec->refcount_off = -EINVAL; 3988 + rec->task_work_off = -EINVAL; 3972 3989 for (i = 0; i < cnt; i++) { 3973 3990 field_type_size = btf_field_type_size(info_arr[i].type); 3974 3991 if (info_arr[i].off + field_type_size > value_size) { ··· 4008 4023 WARN_ON_ONCE(rec->wq_off >= 0); 4009 4024 /* Cache offset for faster lookup at runtime */ 4010 4025 rec->wq_off = rec->fields[i].offset; 4026 + break; 4027 + case BPF_TASK_WORK: 4028 + WARN_ON_ONCE(rec->task_work_off >= 0); 4029 + rec->task_work_off = rec->fields[i].offset; 4011 4030 break; 4012 4031 case BPF_REFCOUNT: 4013 4032 WARN_ON_ONCE(rec->refcount_off >= 0);
+23 -20
kernel/bpf/hashtab.c
··· 215 215 return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); 216 216 } 217 217 218 - static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) 218 + static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem) 219 + { 220 + if (btf_record_has_field(htab->map.record, BPF_TIMER)) 221 + bpf_obj_free_timer(htab->map.record, 222 + htab_elem_value(elem, htab->map.key_size)); 223 + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) 224 + bpf_obj_free_workqueue(htab->map.record, 225 + htab_elem_value(elem, htab->map.key_size)); 226 + if (btf_record_has_field(htab->map.record, BPF_TASK_WORK)) 227 + bpf_obj_free_task_work(htab->map.record, 228 + htab_elem_value(elem, htab->map.key_size)); 229 + } 230 + 231 + static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) 219 232 { 220 233 u32 num_entries = htab->map.max_entries; 221 234 int i; ··· 240 227 struct htab_elem *elem; 241 228 242 229 elem = get_htab_elem(htab, i); 243 - if (btf_record_has_field(htab->map.record, BPF_TIMER)) 244 - bpf_obj_free_timer(htab->map.record, 245 - htab_elem_value(elem, htab->map.key_size)); 246 - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) 247 - bpf_obj_free_workqueue(htab->map.record, 248 - htab_elem_value(elem, htab->map.key_size)); 230 + htab_free_internal_structs(htab, elem); 249 231 cond_resched(); 250 232 } 251 233 } ··· 1498 1490 } 1499 1491 } 1500 1492 1501 - static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) 1493 + static void htab_free_malloced_internal_structs(struct bpf_htab *htab) 1502 1494 { 1503 1495 int i; 1504 1496 ··· 1510 1502 1511 1503 hlist_nulls_for_each_entry(l, n, head, hash_node) { 1512 1504 /* We only free timer on uref dropping to zero */ 1513 - if (btf_record_has_field(htab->map.record, BPF_TIMER)) 1514 - bpf_obj_free_timer(htab->map.record, 1515 - htab_elem_value(l, htab->map.key_size)); 1516 - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) 
1517 - bpf_obj_free_workqueue(htab->map.record, 1518 - htab_elem_value(l, htab->map.key_size)); 1505 + htab_free_internal_structs(htab, l); 1519 1506 } 1520 1507 cond_resched_rcu(); 1521 1508 } 1522 1509 rcu_read_unlock(); 1523 1510 } 1524 1511 1525 - static void htab_map_free_timers_and_wq(struct bpf_map *map) 1512 + static void htab_map_free_internal_structs(struct bpf_map *map) 1526 1513 { 1527 1514 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1528 1515 1529 1516 /* We only free timer and workqueue on uref dropping to zero */ 1530 - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { 1517 + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { 1531 1518 if (!htab_is_prealloc(htab)) 1532 - htab_free_malloced_timers_and_wq(htab); 1519 + htab_free_malloced_internal_structs(htab); 1533 1520 else 1534 - htab_free_prealloced_timers_and_wq(htab); 1521 + htab_free_prealloced_internal_structs(htab); 1535 1522 } 1536 1523 } 1537 1524 ··· 2258 2255 .map_alloc = htab_map_alloc, 2259 2256 .map_free = htab_map_free, 2260 2257 .map_get_next_key = htab_map_get_next_key, 2261 - .map_release_uref = htab_map_free_timers_and_wq, 2258 + .map_release_uref = htab_map_free_internal_structs, 2262 2259 .map_lookup_elem = htab_map_lookup_elem, 2263 2260 .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, 2264 2261 .map_update_elem = htab_map_update_elem, ··· 2279 2276 .map_alloc = htab_map_alloc, 2280 2277 .map_free = htab_map_free, 2281 2278 .map_get_next_key = htab_map_get_next_key, 2282 - .map_release_uref = htab_map_free_timers_and_wq, 2279 + .map_release_uref = htab_map_free_internal_structs, 2283 2280 .map_lookup_elem = htab_lru_map_lookup_elem, 2284 2281 .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, 2285 2282 .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
+341 -17
kernel/bpf/helpers.c
··· 26 26 #include <linux/bpf_verifier.h> 27 27 #include <linux/uaccess.h> 28 28 #include <linux/verification.h> 29 + #include <linux/task_work.h> 30 + #include <linux/irq_work.h> 29 31 30 32 #include "../../lib/kstrtox.h" 31 33 ··· 1084 1082 .arg5_type = ARG_CONST_SIZE_OR_ZERO, 1085 1083 }; 1086 1084 1085 + static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) 1086 + { 1087 + if (map->map_type == BPF_MAP_TYPE_ARRAY) { 1088 + struct bpf_array *array = container_of(map, struct bpf_array, map); 1089 + 1090 + *arr_idx = ((char *)value - array->value) / array->elem_size; 1091 + return arr_idx; 1092 + } 1093 + return (void *)value - round_up(map->key_size, 8); 1094 + } 1095 + 1087 1096 struct bpf_async_cb { 1088 1097 struct bpf_map *map; 1089 1098 struct bpf_prog *prog; ··· 1177 1164 * bpf_map_delete_elem() on the same timer. 1178 1165 */ 1179 1166 this_cpu_write(hrtimer_running, t); 1180 - if (map->map_type == BPF_MAP_TYPE_ARRAY) { 1181 - struct bpf_array *array = container_of(map, struct bpf_array, map); 1182 1167 1183 - /* compute the key */ 1184 - idx = ((char *)value - array->value) / array->elem_size; 1185 - key = &idx; 1186 - } else { /* hash or lru */ 1187 - key = value - round_up(map->key_size, 8); 1188 - } 1168 + key = map_key_from_value(map, value, &idx); 1189 1169 1190 1170 callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); 1191 1171 /* The verifier checked that return value is zero. 
*/ ··· 1204 1198 if (!callback_fn) 1205 1199 return; 1206 1200 1207 - if (map->map_type == BPF_MAP_TYPE_ARRAY) { 1208 - struct bpf_array *array = container_of(map, struct bpf_array, map); 1209 - 1210 - /* compute the key */ 1211 - idx = ((char *)value - array->value) / array->elem_size; 1212 - key = &idx; 1213 - } else { /* hash or lru */ 1214 - key = value - round_up(map->key_size, 8); 1215 - } 1201 + key = map_key_from_value(map, value, &idx); 1216 1202 1217 1203 rcu_read_lock_trace(); 1218 1204 migrate_disable(); ··· 3904 3906 } 3905 3907 #endif /* CONFIG_KEYS */ 3906 3908 3909 + typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); 3910 + 3911 + enum bpf_task_work_state { 3912 + /* bpf_task_work is ready to be used */ 3913 + BPF_TW_STANDBY = 0, 3914 + /* irq work scheduling in progress */ 3915 + BPF_TW_PENDING, 3916 + /* task work scheduling in progress */ 3917 + BPF_TW_SCHEDULING, 3918 + /* task work is scheduled successfully */ 3919 + BPF_TW_SCHEDULED, 3920 + /* callback is running */ 3921 + BPF_TW_RUNNING, 3922 + /* associated BPF map value is deleted */ 3923 + BPF_TW_FREED, 3924 + }; 3925 + 3926 + struct bpf_task_work_ctx { 3927 + enum bpf_task_work_state state; 3928 + refcount_t refcnt; 3929 + struct callback_head work; 3930 + struct irq_work irq_work; 3931 + /* bpf_prog that schedules task work */ 3932 + struct bpf_prog *prog; 3933 + /* task for which callback is scheduled */ 3934 + struct task_struct *task; 3935 + /* the map and map value associated with this context */ 3936 + struct bpf_map *map; 3937 + void *map_val; 3938 + enum task_work_notify_mode mode; 3939 + bpf_task_work_callback_t callback_fn; 3940 + struct rcu_head rcu; 3941 + } __aligned(8); 3942 + 3943 + /* Actual type for struct bpf_task_work */ 3944 + struct bpf_task_work_kern { 3945 + struct bpf_task_work_ctx *ctx; 3946 + }; 3947 + 3948 + static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx) 3949 + { 3950 + if (ctx->prog) { 3951 + 
bpf_prog_put(ctx->prog); 3952 + ctx->prog = NULL; 3953 + } 3954 + if (ctx->task) { 3955 + bpf_task_release(ctx->task); 3956 + ctx->task = NULL; 3957 + } 3958 + } 3959 + 3960 + static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx) 3961 + { 3962 + return refcount_inc_not_zero(&ctx->refcnt); 3963 + } 3964 + 3965 + static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) 3966 + { 3967 + if (!refcount_dec_and_test(&ctx->refcnt)) 3968 + return; 3969 + 3970 + bpf_task_work_ctx_reset(ctx); 3971 + 3972 + /* bpf_mem_free expects migration to be disabled */ 3973 + migrate_disable(); 3974 + bpf_mem_free(&bpf_global_ma, ctx); 3975 + migrate_enable(); 3976 + } 3977 + 3978 + static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) 3979 + { 3980 + /* 3981 + * Scheduled task_work callback holds ctx ref, so if we successfully 3982 + * cancelled, we put that ref on callback's behalf. If we couldn't 3983 + * cancel, callback will inevitably run or has already completed 3984 + * running, and it would have taken care of its ctx ref itself. 3985 + */ 3986 + if (task_work_cancel(ctx->task, &ctx->work)) 3987 + bpf_task_work_ctx_put(ctx); 3988 + } 3989 + 3990 + static void bpf_task_work_callback(struct callback_head *cb) 3991 + { 3992 + struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work); 3993 + enum bpf_task_work_state state; 3994 + u32 idx; 3995 + void *key; 3996 + 3997 + /* Read lock is needed to protect ctx and map key/value access */ 3998 + guard(rcu_tasks_trace)(); 3999 + /* 4000 + * This callback may start running before bpf_task_work_irq() switched to 4001 + * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING. 
4002 + */ 4003 + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING); 4004 + if (state == BPF_TW_SCHEDULED) 4005 + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING); 4006 + if (state == BPF_TW_FREED) { 4007 + bpf_task_work_ctx_put(ctx); 4008 + return; 4009 + } 4010 + 4011 + key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx); 4012 + 4013 + migrate_disable(); 4014 + ctx->callback_fn(ctx->map, key, ctx->map_val); 4015 + migrate_enable(); 4016 + 4017 + bpf_task_work_ctx_reset(ctx); 4018 + (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY); 4019 + 4020 + bpf_task_work_ctx_put(ctx); 4021 + } 4022 + 4023 + static void bpf_task_work_irq(struct irq_work *irq_work) 4024 + { 4025 + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); 4026 + enum bpf_task_work_state state; 4027 + int err; 4028 + 4029 + guard(rcu_tasks_trace)(); 4030 + 4031 + if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { 4032 + bpf_task_work_ctx_put(ctx); 4033 + return; 4034 + } 4035 + 4036 + err = task_work_add(ctx->task, &ctx->work, ctx->mode); 4037 + if (err) { 4038 + bpf_task_work_ctx_reset(ctx); 4039 + /* 4040 + * try to switch back to STANDBY for another task_work reuse, but we might have 4041 + * gone to FREED already, which is fine as we already cleaned up after ourselves 4042 + */ 4043 + (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY); 4044 + bpf_task_work_ctx_put(ctx); 4045 + return; 4046 + } 4047 + 4048 + /* 4049 + * It's technically possible for just scheduled task_work callback to 4050 + * complete running by now, going SCHEDULING -> RUNNING and then 4051 + * dropping its ctx refcount. Instead of capturing extra ref just to 4052 + * protected below ctx->state access, we rely on RCU protection to 4053 + * perform below SCHEDULING -> SCHEDULED attempt. 
4054 + */ 4055 + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); 4056 + if (state == BPF_TW_FREED) 4057 + bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */ 4058 + } 4059 + 4060 + static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw, 4061 + struct bpf_map *map) 4062 + { 4063 + struct bpf_task_work_kern *twk = (void *)tw; 4064 + struct bpf_task_work_ctx *ctx, *old_ctx; 4065 + 4066 + ctx = READ_ONCE(twk->ctx); 4067 + if (ctx) 4068 + return ctx; 4069 + 4070 + ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx)); 4071 + if (!ctx) 4072 + return ERR_PTR(-ENOMEM); 4073 + 4074 + memset(ctx, 0, sizeof(*ctx)); 4075 + refcount_set(&ctx->refcnt, 1); /* map's own ref */ 4076 + ctx->state = BPF_TW_STANDBY; 4077 + 4078 + old_ctx = cmpxchg(&twk->ctx, NULL, ctx); 4079 + if (old_ctx) { 4080 + /* 4081 + * tw->ctx is set by concurrent BPF program, release allocated 4082 + * memory and try to reuse already set context. 4083 + */ 4084 + bpf_mem_free(&bpf_global_ma, ctx); 4085 + return old_ctx; 4086 + } 4087 + 4088 + return ctx; /* Success */ 4089 + } 4090 + 4091 + static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw, 4092 + struct bpf_map *map) 4093 + { 4094 + struct bpf_task_work_ctx *ctx; 4095 + 4096 + ctx = bpf_task_work_fetch_ctx(tw, map); 4097 + if (IS_ERR(ctx)) 4098 + return ctx; 4099 + 4100 + /* try to get ref for task_work callback to hold */ 4101 + if (!bpf_task_work_ctx_tryget(ctx)) 4102 + return ERR_PTR(-EBUSY); 4103 + 4104 + if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { 4105 + /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ 4106 + bpf_task_work_ctx_put(ctx); 4107 + return ERR_PTR(-EBUSY); 4108 + } 4109 + 4110 + /* 4111 + * If no process or bpffs is holding a reference to the map, no new callbacks should be 4112 + * scheduled. 
This does not address any race or correctness issue, but rather is a policy 4113 + * choice: dropping user references should stop everything. 4114 + */ 4115 + if (!atomic64_read(&map->usercnt)) { 4116 + /* drop ref we just got for task_work callback itself */ 4117 + bpf_task_work_ctx_put(ctx); 4118 + /* transfer map's ref into cancel_and_free() */ 4119 + bpf_task_work_cancel_and_free(tw); 4120 + return ERR_PTR(-EBUSY); 4121 + } 4122 + 4123 + return ctx; 4124 + } 4125 + 4126 + static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw, 4127 + struct bpf_map *map, bpf_task_work_callback_t callback_fn, 4128 + struct bpf_prog_aux *aux, enum task_work_notify_mode mode) 4129 + { 4130 + struct bpf_prog *prog; 4131 + struct bpf_task_work_ctx *ctx; 4132 + int err; 4133 + 4134 + BTF_TYPE_EMIT(struct bpf_task_work); 4135 + 4136 + prog = bpf_prog_inc_not_zero(aux->prog); 4137 + if (IS_ERR(prog)) 4138 + return -EBADF; 4139 + task = bpf_task_acquire(task); 4140 + if (!task) { 4141 + err = -EBADF; 4142 + goto release_prog; 4143 + } 4144 + 4145 + ctx = bpf_task_work_acquire_ctx(tw, map); 4146 + if (IS_ERR(ctx)) { 4147 + err = PTR_ERR(ctx); 4148 + goto release_all; 4149 + } 4150 + 4151 + ctx->task = task; 4152 + ctx->callback_fn = callback_fn; 4153 + ctx->prog = prog; 4154 + ctx->mode = mode; 4155 + ctx->map = map; 4156 + ctx->map_val = (void *)tw - map->record->task_work_off; 4157 + init_task_work(&ctx->work, bpf_task_work_callback); 4158 + init_irq_work(&ctx->irq_work, bpf_task_work_irq); 4159 + 4160 + irq_work_queue(&ctx->irq_work); 4161 + return 0; 4162 + 4163 + release_all: 4164 + bpf_task_release(task); 4165 + release_prog: 4166 + bpf_prog_put(prog); 4167 + return err; 4168 + } 4169 + 4170 + /** 4171 + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode 4172 + * @task: Task struct for which callback should be scheduled 4173 + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping 
4174 + * @map__map: bpf_map that embeds struct bpf_task_work in the values 4175 + * @callback: pointer to BPF subprogram to call 4176 + * @aux__prog: user should pass NULL 4177 + * 4178 + * Return: 0 if task work has been scheduled successfully, negative error code otherwise 4179 + */ 4180 + __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, 4181 + void *map__map, bpf_task_work_callback_t callback, 4182 + void *aux__prog) 4183 + { 4184 + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); 4185 + } 4186 + 4187 + /** 4188 + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode 4189 + * @task: Task struct for which callback should be scheduled 4190 + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping 4191 + * @map__map: bpf_map that embeds struct bpf_task_work in the values 4192 + * @callback: pointer to BPF subprogram to call 4193 + * @aux__prog: user should pass NULL 4194 + * 4195 + * Return: 0 if task work has been scheduled successfully, negative error code otherwise 4196 + */ 4197 + __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, 4198 + void *map__map, bpf_task_work_callback_t callback, 4199 + void *aux__prog) 4200 + { 4201 + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); 4202 + } 4203 + 3907 4204 __bpf_kfunc_end_defs(); 4205 + 4206 + static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) 4207 + { 4208 + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); 4209 + 4210 + bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */ 4211 + bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */ 4212 + } 4213 + 4214 + void bpf_task_work_cancel_and_free(void *val) 4215 + { 4216 + struct bpf_task_work_kern *twk = val; 4217 + struct 
bpf_task_work_ctx *ctx; 4218 + enum bpf_task_work_state state; 4219 + 4220 + ctx = xchg(&twk->ctx, NULL); 4221 + if (!ctx) 4222 + return; 4223 + 4224 + state = xchg(&ctx->state, BPF_TW_FREED); 4225 + if (state == BPF_TW_SCHEDULED) { 4226 + /* run in irq_work to avoid locks in NMI */ 4227 + init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled); 4228 + irq_work_queue(&ctx->irq_work); 4229 + return; 4230 + } 4231 + 4232 + bpf_task_work_ctx_put(ctx); /* put bpf map's ref */ 4233 + } 3908 4234 3909 4235 BTF_KFUNCS_START(generic_btf_ids) 3910 4236 #ifdef CONFIG_CRASH_DUMP ··· 4372 4050 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) 4373 4051 #endif 4374 4052 BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) 4053 + BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) 4054 + BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) 4375 4055 BTF_KFUNCS_END(common_btf_ids) 4376 4056 4377 4057 static const struct btf_kfunc_id_set common_kfunc_set = {
+15 -1
kernel/bpf/syscall.c
··· 674 674 case BPF_TIMER: 675 675 case BPF_REFCOUNT: 676 676 case BPF_WORKQUEUE: 677 + case BPF_TASK_WORK: 677 678 /* Nothing to release */ 678 679 break; 679 680 default: ··· 728 727 case BPF_TIMER: 729 728 case BPF_REFCOUNT: 730 729 case BPF_WORKQUEUE: 730 + case BPF_TASK_WORK: 731 731 /* Nothing to acquire */ 732 732 break; 733 733 default: ··· 787 785 bpf_wq_cancel_and_free(obj + rec->wq_off); 788 786 } 789 787 788 + void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) 789 + { 790 + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) 791 + return; 792 + bpf_task_work_cancel_and_free(obj + rec->task_work_off); 793 + } 794 + 790 795 void bpf_obj_free_fields(const struct btf_record *rec, void *obj) 791 796 { 792 797 const struct btf_field *fields; ··· 817 808 break; 818 809 case BPF_WORKQUEUE: 819 810 bpf_wq_cancel_and_free(field_ptr); 811 + break; 812 + case BPF_TASK_WORK: 813 + bpf_task_work_cancel_and_free(field_ptr); 820 814 break; 821 815 case BPF_KPTR_UNREF: 822 816 WRITE_ONCE(*(u64 *)field_ptr, 0); ··· 1252 1240 1253 1241 map->record = btf_parse_fields(btf, value_type, 1254 1242 BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | 1255 - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, 1243 + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | 1244 + BPF_TASK_WORK, 1256 1245 map->value_size); 1257 1246 if (!IS_ERR_OR_NULL(map->record)) { 1258 1247 int i; ··· 1285 1272 break; 1286 1273 case BPF_TIMER: 1287 1274 case BPF_WORKQUEUE: 1275 + case BPF_TASK_WORK: 1288 1276 if (map->map_type != BPF_MAP_TYPE_HASH && 1289 1277 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1290 1278 map->map_type != BPF_MAP_TYPE_ARRAY) {
+149 -20
kernel/bpf/verifier.c
··· 2224 2224 /* transfer reg's id which is unique for every map_lookup_elem 2225 2225 * as UID of the inner map. 2226 2226 */ 2227 - if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) 2227 + if (btf_record_has_field(map->inner_map_meta->record, 2228 + BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { 2228 2229 reg->map_uid = reg->id; 2229 - if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) 2230 - reg->map_uid = reg->id; 2230 + } 2231 2231 } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { 2232 2232 reg->type = PTR_TO_XDP_SOCK; 2233 2233 } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || ··· 8431 8431 return 0; 8432 8432 } 8433 8433 8434 - static int process_timer_func(struct bpf_verifier_env *env, int regno, 8435 - struct bpf_call_arg_meta *meta) 8434 + /* Check if @regno is a pointer to a specific field in a map value */ 8435 + static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, 8436 + enum btf_field_type field_type) 8436 8437 { 8437 8438 struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 8438 8439 bool is_const = tnum_is_const(reg->var_off); 8439 8440 struct bpf_map *map = reg->map_ptr; 8440 8441 u64 val = reg->var_off.value; 8442 + const char *struct_name = btf_field_type_name(field_type); 8443 + int field_off = -1; 8441 8444 8442 8445 if (!is_const) { 8443 8446 verbose(env, 8444 - "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n", 8445 - regno); 8447 + "R%d doesn't have constant offset. 
%s has to be at the constant offset\n", 8448 + regno, struct_name); 8446 8449 return -EINVAL; 8447 8450 } 8448 8451 if (!map->btf) { 8449 - verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n", 8450 - map->name); 8452 + verbose(env, "map '%s' has to have BTF in order to use %s\n", map->name, 8453 + struct_name); 8451 8454 return -EINVAL; 8452 8455 } 8453 - if (!btf_record_has_field(map->record, BPF_TIMER)) { 8454 - verbose(env, "map '%s' has no valid bpf_timer\n", map->name); 8456 + if (!btf_record_has_field(map->record, field_type)) { 8457 + verbose(env, "map '%s' has no valid %s\n", map->name, struct_name); 8455 8458 return -EINVAL; 8456 8459 } 8457 - if (map->record->timer_off != val + reg->off) { 8458 - verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", 8459 - val + reg->off, map->record->timer_off); 8460 + switch (field_type) { 8461 + case BPF_TIMER: 8462 + field_off = map->record->timer_off; 8463 + break; 8464 + case BPF_TASK_WORK: 8465 + field_off = map->record->task_work_off; 8466 + break; 8467 + default: 8468 + verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); 8460 8469 return -EINVAL; 8461 8470 } 8471 + if (field_off != val + reg->off) { 8472 + verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n", 8473 + val + reg->off, struct_name, field_off); 8474 + return -EINVAL; 8475 + } 8476 + return 0; 8477 + } 8478 + 8479 + static int process_timer_func(struct bpf_verifier_env *env, int regno, 8480 + struct bpf_call_arg_meta *meta) 8481 + { 8482 + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 8483 + struct bpf_map *map = reg->map_ptr; 8484 + int err; 8485 + 8486 + err = check_map_field_pointer(env, regno, BPF_TIMER); 8487 + if (err) 8488 + return err; 8489 + 8462 8490 if (meta->map_ptr) { 8463 8491 verifier_bug(env, "Two map pointers in a timer helper"); 8464 8492 return -EFAULT; ··· 8511 8483 verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n", 8512 
8484 val + reg->off, map->record->wq_off); 8513 8485 return -EINVAL; 8486 + } 8487 + meta->map.uid = reg->map_uid; 8488 + meta->map.ptr = map; 8489 + return 0; 8490 + } 8491 + 8492 + static int process_task_work_func(struct bpf_verifier_env *env, int regno, 8493 + struct bpf_kfunc_call_arg_meta *meta) 8494 + { 8495 + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 8496 + struct bpf_map *map = reg->map_ptr; 8497 + int err; 8498 + 8499 + err = check_map_field_pointer(env, regno, BPF_TASK_WORK); 8500 + if (err) 8501 + return err; 8502 + 8503 + if (meta->map.ptr) { 8504 + verifier_bug(env, "Two map pointers in a bpf_task_work helper"); 8505 + return -EFAULT; 8514 8506 } 8515 8507 meta->map.uid = reg->map_uid; 8516 8508 meta->map.ptr = map; ··· 10366 10318 struct bpf_func_state *callee, 10367 10319 int insn_idx); 10368 10320 10321 + static bool is_task_work_add_kfunc(u32 func_id); 10322 + 10369 10323 static int set_callee_state(struct bpf_verifier_env *env, 10370 10324 struct bpf_func_state *caller, 10371 10325 struct bpf_func_state *callee, int insn_idx); ··· 10586 10536 env->subprog_info[subprog].is_async_cb = true; 10587 10537 async_cb = push_async_cb(env, env->subprog_info[subprog].start, 10588 10538 insn_idx, subprog, 10589 - is_bpf_wq_set_callback_impl_kfunc(insn->imm)); 10539 + is_bpf_wq_set_callback_impl_kfunc(insn->imm) || 10540 + is_task_work_add_kfunc(insn->imm)); 10590 10541 if (!async_cb) 10591 10542 return -EFAULT; 10592 10543 callee = async_cb->frame[0]; ··· 10815 10764 __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); 10816 10765 __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); 10817 10766 callee->in_async_callback_fn = true; 10818 - callee->callback_ret_range = retval_range(0, 1); 10767 + callee->callback_ret_range = retval_range(0, 0); 10819 10768 return 0; 10820 10769 } 10821 10770 ··· 10899 10848 __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); 10900 10849 callee->in_callback_fn = true; 10901 10850 callee->callback_ret_range 
= retval_range(0, 1); 10851 + return 0; 10852 + } 10853 + 10854 + static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, 10855 + struct bpf_func_state *caller, 10856 + struct bpf_func_state *callee, 10857 + int insn_idx) 10858 + { 10859 + struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr; 10860 + 10861 + /* 10862 + * callback_fn(struct bpf_map *map, void *key, void *value); 10863 + */ 10864 + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; 10865 + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); 10866 + callee->regs[BPF_REG_1].map_ptr = map_ptr; 10867 + 10868 + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; 10869 + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); 10870 + callee->regs[BPF_REG_2].map_ptr = map_ptr; 10871 + 10872 + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; 10873 + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); 10874 + callee->regs[BPF_REG_3].map_ptr = map_ptr; 10875 + 10876 + /* unused */ 10877 + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); 10878 + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); 10879 + callee->in_async_callback_fn = true; 10880 + callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); 10902 10881 return 0; 10903 10882 } 10904 10883 ··· 12056 11975 KF_ARG_RB_NODE_ID, 12057 11976 KF_ARG_WORKQUEUE_ID, 12058 11977 KF_ARG_RES_SPIN_LOCK_ID, 11978 + KF_ARG_TASK_WORK_ID, 12059 11979 }; 12060 11980 12061 11981 BTF_ID_LIST(kf_arg_btf_ids) ··· 12067 11985 BTF_ID(struct, bpf_rb_node) 12068 11986 BTF_ID(struct, bpf_wq) 12069 11987 BTF_ID(struct, bpf_res_spin_lock) 11988 + BTF_ID(struct, bpf_task_work) 12070 11989 12071 11990 static bool __is_kfunc_ptr_arg_type(const struct btf *btf, 12072 11991 const struct btf_param *arg, int type) ··· 12114 12031 static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) 12115 12032 { 12116 12033 return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); 12034 + } 12035 + 12036 + static bool is_kfunc_arg_task_work(const struct 
btf *btf, const struct btf_param *arg) 12037 + { 12038 + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID); 12117 12039 } 12118 12040 12119 12041 static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) ··· 12208 12120 KF_ARG_PTR_TO_WORKQUEUE, 12209 12121 KF_ARG_PTR_TO_IRQ_FLAG, 12210 12122 KF_ARG_PTR_TO_RES_SPIN_LOCK, 12123 + KF_ARG_PTR_TO_TASK_WORK, 12211 12124 }; 12212 12125 12213 12126 enum special_kfunc_type { ··· 12258 12169 KF_bpf_res_spin_lock_irqsave, 12259 12170 KF_bpf_res_spin_unlock_irqrestore, 12260 12171 KF___bpf_trap, 12172 + KF_bpf_task_work_schedule_signal, 12173 + KF_bpf_task_work_schedule_resume, 12261 12174 }; 12262 12175 12263 12176 BTF_ID_LIST(special_kfunc_list) ··· 12328 12237 BTF_ID(func, bpf_res_spin_lock_irqsave) 12329 12238 BTF_ID(func, bpf_res_spin_unlock_irqrestore) 12330 12239 BTF_ID(func, __bpf_trap) 12240 + BTF_ID(func, bpf_task_work_schedule_signal) 12241 + BTF_ID(func, bpf_task_work_schedule_resume) 12242 + 12243 + static bool is_task_work_add_kfunc(u32 func_id) 12244 + { 12245 + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || 12246 + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; 12247 + } 12331 12248 12332 12249 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) 12333 12250 { ··· 12425 12326 12426 12327 if (is_kfunc_arg_wq(meta->btf, &args[argno])) 12427 12328 return KF_ARG_PTR_TO_WORKQUEUE; 12329 + 12330 + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) 12331 + return KF_ARG_PTR_TO_TASK_WORK; 12428 12332 12429 12333 if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) 12430 12334 return KF_ARG_PTR_TO_IRQ_FLAG; ··· 12772 12670 12773 12671 static bool is_async_callback_calling_kfunc(u32 btf_id) 12774 12672 { 12775 - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; 12673 + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || 12674 + is_task_work_add_kfunc(btf_id); 12776 12675 } 12777 12676 
12778 12677 static bool is_bpf_throw_kfunc(struct bpf_insn *insn) ··· 13154 13051 verbose(env, "pointer in R%d isn't map pointer\n", regno); 13155 13052 return -EINVAL; 13156 13053 } 13157 - if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { 13054 + if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || 13055 + reg->map_ptr->record->task_work_off >= 0)) { 13158 13056 /* Use map_uid (which is unique id of inner map) to reject: 13159 13057 * inner_map1 = bpf_map_lookup_elem(outer_map, key1) 13160 13058 * inner_map2 = bpf_map_lookup_elem(outer_map, key2) ··· 13170 13066 */ 13171 13067 if (meta->map.ptr != reg->map_ptr || 13172 13068 meta->map.uid != reg->map_uid) { 13069 + if (reg->map_ptr->record->task_work_off >= 0) { 13070 + verbose(env, 13071 + "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n", 13072 + meta->map.uid, reg->map_uid); 13073 + return -EINVAL; 13074 + } 13173 13075 verbose(env, 13174 13076 "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", 13175 13077 meta->map.uid, reg->map_uid); ··· 13214 13104 case KF_ARG_PTR_TO_REFCOUNTED_KPTR: 13215 13105 case KF_ARG_PTR_TO_CONST_STR: 13216 13106 case KF_ARG_PTR_TO_WORKQUEUE: 13107 + case KF_ARG_PTR_TO_TASK_WORK: 13217 13108 case KF_ARG_PTR_TO_IRQ_FLAG: 13218 13109 case KF_ARG_PTR_TO_RES_SPIN_LOCK: 13219 13110 break; ··· 13505 13394 return -EINVAL; 13506 13395 } 13507 13396 ret = process_wq_func(env, regno, meta); 13397 + if (ret < 0) 13398 + return ret; 13399 + break; 13400 + case KF_ARG_PTR_TO_TASK_WORK: 13401 + if (reg->type != PTR_TO_MAP_VALUE) { 13402 + verbose(env, "arg#%d doesn't point to a map value\n", i); 13403 + return -EINVAL; 13404 + } 13405 + ret = process_task_work_func(env, regno, meta); 13508 13406 if (ret < 0) 13509 13407 return ret; 13510 13408 break; ··· 13876 13756 if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { 13877 13757 err = push_callback_call(env, insn, insn_idx, meta.subprogno, 13878 13758 
set_timer_callback_state); 13759 + if (err) { 13760 + verbose(env, "kfunc %s#%d failed callback verification\n", 13761 + func_name, meta.func_id); 13762 + return err; 13763 + } 13764 + } 13765 + 13766 + if (is_task_work_add_kfunc(meta.func_id)) { 13767 + err = push_callback_call(env, insn, insn_idx, meta.subprogno, 13768 + set_task_work_schedule_callback_state); 13879 13769 if (err) { 13880 13770 verbose(env, "kfunc %s#%d failed callback verification\n", 13881 13771 func_name, meta.func_id); ··· 17178 17048 } 17179 17049 17180 17050 if (frame->in_async_callback_fn) { 17181 - /* enforce return zero from async callbacks like timer */ 17182 17051 exit_ctx = "At async callback return"; 17183 - range = retval_range(0, 0); 17052 + range = frame->callback_ret_range; 17184 17053 goto enforce_retval; 17185 17054 } 17186 17055
+4
tools/include/uapi/linux/bpf.h
··· 7436 7436 __u64 __opaque[2]; 7437 7437 } __attribute__((aligned(8))); 7438 7438 7439 + struct bpf_task_work { 7440 + __u64 __opaque; 7441 + } __attribute__((aligned(8))); 7442 + 7439 7443 struct bpf_wq { 7440 7444 __u64 __opaque[2]; 7441 7445 } __attribute__((aligned(8)));
+130
tools/testing/selftests/bpf/prog_tests/task_work_stress.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <test_progs.h> 4 + #include <string.h> 5 + #include <stdio.h> 6 + #include "task_work_stress.skel.h" 7 + #include <linux/bpf.h> 8 + #include <linux/perf_event.h> 9 + #include <sys/syscall.h> 10 + #include <time.h> 11 + #include <stdlib.h> 12 + #include <stdatomic.h> 13 + 14 + struct test_data { 15 + int prog_fd; 16 + atomic_int exit; 17 + }; 18 + 19 + void *runner(void *test_data) 20 + { 21 + struct test_data *td = test_data; 22 + int err = 0; 23 + LIBBPF_OPTS(bpf_test_run_opts, opts); 24 + 25 + while (!err && !atomic_load(&td->exit)) 26 + err = bpf_prog_test_run_opts(td->prog_fd, &opts); 27 + 28 + return NULL; 29 + } 30 + 31 + static int get_env_int(const char *str, int def) 32 + { 33 + const char *s = getenv(str); 34 + char *end; 35 + int retval; 36 + 37 + if (!s || !*s) 38 + return def; 39 + errno = 0; 40 + retval = strtol(s, &end, 10); 41 + if (errno || *end || retval < 0) 42 + return def; 43 + return retval; 44 + } 45 + 46 + static void task_work_run(bool enable_delete) 47 + { 48 + struct task_work_stress *skel; 49 + struct bpf_program *scheduler, *deleter; 50 + int nthreads = 16; 51 + int test_time_s = get_env_int("BPF_TASK_WORK_TEST_TIME", 1); 52 + pthread_t tid[nthreads], tid_del; 53 + bool started[nthreads], started_del = false; 54 + struct test_data td_sched = { .exit = 0 }, td_del = { .exit = 1 }; 55 + int i, err; 56 + 57 + skel = task_work_stress__open(); 58 + if (!ASSERT_OK_PTR(skel, "task_work__open")) 59 + return; 60 + 61 + scheduler = bpf_object__find_program_by_name(skel->obj, "schedule_task_work"); 62 + bpf_program__set_autoload(scheduler, true); 63 + 64 + deleter = bpf_object__find_program_by_name(skel->obj, "delete_task_work"); 65 + bpf_program__set_autoload(deleter, true); 66 + 67 + err = task_work_stress__load(skel); 68 + if (!ASSERT_OK(err, "skel_load")) 69 + goto cleanup; 70 + 71 + for (i = 0; i < nthreads; ++i) 72 + 
started[i] = false; 73 + 74 + td_sched.prog_fd = bpf_program__fd(scheduler); 75 + for (i = 0; i < nthreads; ++i) { 76 + if (pthread_create(&tid[i], NULL, runner, &td_sched) != 0) { 77 + fprintf(stderr, "could not start thread"); 78 + goto cancel; 79 + } 80 + started[i] = true; 81 + } 82 + 83 + if (enable_delete) 84 + atomic_store(&td_del.exit, 0); 85 + 86 + td_del.prog_fd = bpf_program__fd(deleter); 87 + if (pthread_create(&tid_del, NULL, runner, &td_del) != 0) { 88 + fprintf(stderr, "could not start thread"); 89 + goto cancel; 90 + } 91 + started_del = true; 92 + 93 + /* Run stress test for some time */ 94 + sleep(test_time_s); 95 + 96 + cancel: 97 + atomic_store(&td_sched.exit, 1); 98 + atomic_store(&td_del.exit, 1); 99 + for (i = 0; i < nthreads; ++i) { 100 + if (started[i]) 101 + pthread_join(tid[i], NULL); 102 + } 103 + 104 + if (started_del) 105 + pthread_join(tid_del, NULL); 106 + 107 + ASSERT_GT(skel->bss->callback_scheduled, 0, "work scheduled"); 108 + /* Some scheduling attempts should have failed due to contention */ 109 + ASSERT_GT(skel->bss->schedule_error, 0, "schedule error"); 110 + 111 + if (enable_delete) { 112 + /* If delete thread is enabled, it has cancelled some callbacks */ 113 + ASSERT_GT(skel->bss->delete_success, 0, "delete success"); 114 + ASSERT_LT(skel->bss->callback_success, skel->bss->callback_scheduled, "callbacks"); 115 + } else { 116 + /* Without delete thread number of scheduled callbacks is the same as fired */ 117 + ASSERT_EQ(skel->bss->callback_success, skel->bss->callback_scheduled, "callbacks"); 118 + } 119 + 120 + cleanup: 121 + task_work_stress__destroy(skel); 122 + } 123 + 124 + void test_task_work_stress(void) 125 + { 126 + if (test__start_subtest("no_delete")) 127 + task_work_run(false); 128 + if (test__start_subtest("with_delete")) 129 + task_work_run(true); 130 + }
+150
tools/testing/selftests/bpf/prog_tests/test_task_work.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <test_progs.h> 4 + #include <string.h> 5 + #include <stdio.h> 6 + #include "task_work.skel.h" 7 + #include "task_work_fail.skel.h" 8 + #include <linux/bpf.h> 9 + #include <linux/perf_event.h> 10 + #include <sys/syscall.h> 11 + #include <time.h> 12 + 13 + static int perf_event_open(__u32 type, __u64 config, int pid) 14 + { 15 + struct perf_event_attr attr = { 16 + .type = type, 17 + .config = config, 18 + .size = sizeof(struct perf_event_attr), 19 + .sample_period = 100000, 20 + }; 21 + 22 + return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0); 23 + } 24 + 25 + struct elem { 26 + char data[128]; 27 + struct bpf_task_work tw; 28 + }; 29 + 30 + static int verify_map(struct bpf_map *map, const char *expected_data) 31 + { 32 + int err; 33 + struct elem value; 34 + int processed_values = 0; 35 + int k, sz; 36 + 37 + sz = bpf_map__max_entries(map); 38 + for (k = 0; k < sz; ++k) { 39 + err = bpf_map__lookup_elem(map, &k, sizeof(int), &value, sizeof(struct elem), 0); 40 + if (err) 41 + continue; 42 + if (!ASSERT_EQ(strcmp(expected_data, value.data), 0, "map data")) { 43 + fprintf(stderr, "expected '%s', found '%s' in %s map", expected_data, 44 + value.data, bpf_map__name(map)); 45 + return 2; 46 + } 47 + processed_values++; 48 + } 49 + 50 + return processed_values == 0; 51 + } 52 + 53 + static void task_work_run(const char *prog_name, const char *map_name) 54 + { 55 + struct task_work *skel; 56 + struct bpf_program *prog; 57 + struct bpf_map *map; 58 + struct bpf_link *link; 59 + int err, pe_fd = 0, pid, status, pipefd[2]; 60 + char user_string[] = "hello world"; 61 + 62 + if (!ASSERT_NEQ(pipe(pipefd), -1, "pipe")) 63 + return; 64 + 65 + pid = fork(); 66 + if (pid == 0) { 67 + __u64 num = 1; 68 + int i; 69 + char buf; 70 + 71 + close(pipefd[1]); 72 + read(pipefd[0], &buf, sizeof(buf)); 73 + close(pipefd[0]); 74 + 75 + for (i = 0; i < 10000; ++i) 
76 + num *= time(0) % 7; 77 + (void)num; 78 + exit(0); 79 + } 80 + ASSERT_GT(pid, 0, "fork() failed"); 81 + 82 + skel = task_work__open(); 83 + if (!ASSERT_OK_PTR(skel, "task_work__open")) 84 + return; 85 + 86 + bpf_object__for_each_program(prog, skel->obj) { 87 + bpf_program__set_autoload(prog, false); 88 + } 89 + 90 + prog = bpf_object__find_program_by_name(skel->obj, prog_name); 91 + if (!ASSERT_OK_PTR(prog, "prog_name")) 92 + goto cleanup; 93 + bpf_program__set_autoload(prog, true); 94 + skel->bss->user_ptr = (char *)user_string; 95 + 96 + err = task_work__load(skel); 97 + if (!ASSERT_OK(err, "skel_load")) 98 + goto cleanup; 99 + 100 + pe_fd = perf_event_open(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, pid); 101 + if (pe_fd == -1 && (errno == ENOENT || errno == EOPNOTSUPP)) { 102 + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); 103 + test__skip(); 104 + goto cleanup; 105 + } 106 + if (!ASSERT_NEQ(pe_fd, -1, "pe_fd")) { 107 + fprintf(stderr, "perf_event_open errno: %d, pid: %d\n", errno, pid); 108 + goto cleanup; 109 + } 110 + 111 + link = bpf_program__attach_perf_event(prog, pe_fd); 112 + if (!ASSERT_OK_PTR(link, "attach_perf_event")) 113 + goto cleanup; 114 + 115 + close(pipefd[0]); 116 + write(pipefd[1], user_string, 1); 117 + close(pipefd[1]); 118 + /* Wait to collect some samples */ 119 + waitpid(pid, &status, 0); 120 + pid = 0; 121 + map = bpf_object__find_map_by_name(skel->obj, map_name); 122 + if (!ASSERT_OK_PTR(map, "find map_name")) 123 + goto cleanup; 124 + if (!ASSERT_OK(verify_map(map, user_string), "verify map")) 125 + goto cleanup; 126 + cleanup: 127 + if (pe_fd >= 0) 128 + close(pe_fd); 129 + task_work__destroy(skel); 130 + if (pid) { 131 + close(pipefd[0]); 132 + write(pipefd[1], user_string, 1); 133 + close(pipefd[1]); 134 + waitpid(pid, &status, 0); 135 + } 136 + } 137 + 138 + void test_task_work(void) 139 + { 140 + if (test__start_subtest("test_task_work_hash_map")) 141 + task_work_run("oncpu_hash_map", "hmap"); 142 + 143 + if 
(test__start_subtest("test_task_work_array_map")) 144 + task_work_run("oncpu_array_map", "arrmap"); 145 + 146 + if (test__start_subtest("test_task_work_lru_map")) 147 + task_work_run("oncpu_lru_map", "lrumap"); 148 + 149 + RUN_TESTS(task_work_fail); 150 + }
+107
tools/testing/selftests/bpf/progs/task_work.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 + 4 + #include <vmlinux.h> 5 + #include <string.h> 6 + #include <stdbool.h> 7 + #include <bpf/bpf_helpers.h> 8 + #include <bpf/bpf_tracing.h> 9 + #include "bpf_misc.h" 10 + #include "errno.h" 11 + 12 + char _license[] SEC("license") = "GPL"; 13 + 14 + const void *user_ptr = NULL; 15 + 16 + struct elem { 17 + char data[128]; 18 + struct bpf_task_work tw; 19 + }; 20 + 21 + struct { 22 + __uint(type, BPF_MAP_TYPE_HASH); 23 + __uint(map_flags, BPF_F_NO_PREALLOC); 24 + __uint(max_entries, 1); 25 + __type(key, int); 26 + __type(value, struct elem); 27 + } hmap SEC(".maps"); 28 + 29 + struct { 30 + __uint(type, BPF_MAP_TYPE_ARRAY); 31 + __uint(max_entries, 1); 32 + __type(key, int); 33 + __type(value, struct elem); 34 + } arrmap SEC(".maps"); 35 + 36 + struct { 37 + __uint(type, BPF_MAP_TYPE_LRU_HASH); 38 + __uint(max_entries, 1); 39 + __type(key, int); 40 + __type(value, struct elem); 41 + } lrumap SEC(".maps"); 42 + 43 + static int process_work(struct bpf_map *map, void *key, void *value) 44 + { 45 + struct elem *work = value; 46 + 47 + bpf_copy_from_user_str(work->data, sizeof(work->data), (const void *)user_ptr, 0); 48 + return 0; 49 + } 50 + 51 + int key = 0; 52 + 53 + SEC("perf_event") 54 + int oncpu_hash_map(struct pt_regs *args) 55 + { 56 + struct elem empty_work = { .data = { 0 } }; 57 + struct elem *work; 58 + struct task_struct *task; 59 + int err; 60 + 61 + task = bpf_get_current_task_btf(); 62 + err = bpf_map_update_elem(&hmap, &key, &empty_work, BPF_NOEXIST); 63 + if (err) 64 + return 0; 65 + work = bpf_map_lookup_elem(&hmap, &key); 66 + if (!work) 67 + return 0; 68 + 69 + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); 70 + return 0; 71 + } 72 + 73 + SEC("perf_event") 74 + int oncpu_array_map(struct pt_regs *args) 75 + { 76 + struct elem *work; 77 + struct task_struct *task; 78 + 79 + task = bpf_get_current_task_btf(); 
80 + work = bpf_map_lookup_elem(&arrmap, &key); 81 + if (!work) 82 + return 0; 83 + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work, NULL); 84 + return 0; 85 + } 86 + 87 + SEC("perf_event") 88 + int oncpu_lru_map(struct pt_regs *args) 89 + { 90 + struct elem empty_work = { .data = { 0 } }; 91 + struct elem *work; 92 + struct task_struct *task; 93 + int err; 94 + 95 + task = bpf_get_current_task_btf(); 96 + work = bpf_map_lookup_elem(&lrumap, &key); 97 + if (work) 98 + return 0; 99 + err = bpf_map_update_elem(&lrumap, &key, &empty_work, BPF_NOEXIST); 100 + if (err) 101 + return 0; 102 + work = bpf_map_lookup_elem(&lrumap, &key); 103 + if (!work || work->data[0]) 104 + return 0; 105 + bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work, NULL); 106 + return 0; 107 + }
+96
tools/testing/selftests/bpf/progs/task_work_fail.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 + 4 + #include <vmlinux.h> 5 + #include <string.h> 6 + #include <stdbool.h> 7 + #include <bpf/bpf_helpers.h> 8 + #include <bpf/bpf_tracing.h> 9 + #include "bpf_misc.h" 10 + 11 + char _license[] SEC("license") = "GPL"; 12 + 13 + const void *user_ptr = NULL; 14 + 15 + struct elem { 16 + char data[128]; 17 + struct bpf_task_work tw; 18 + }; 19 + 20 + struct { 21 + __uint(type, BPF_MAP_TYPE_HASH); 22 + __uint(map_flags, BPF_F_NO_PREALLOC); 23 + __uint(max_entries, 1); 24 + __type(key, int); 25 + __type(value, struct elem); 26 + } hmap SEC(".maps"); 27 + 28 + struct { 29 + __uint(type, BPF_MAP_TYPE_ARRAY); 30 + __uint(max_entries, 1); 31 + __type(key, int); 32 + __type(value, struct elem); 33 + } arrmap SEC(".maps"); 34 + 35 + static int process_work(struct bpf_map *map, void *key, void *value) 36 + { 37 + struct elem *work = value; 38 + 39 + bpf_copy_from_user_str(work->data, sizeof(work->data), (const void *)user_ptr, 0); 40 + return 0; 41 + } 42 + 43 + int key = 0; 44 + 45 + SEC("perf_event") 46 + __failure __msg("doesn't match map pointer in R3") 47 + int mismatch_map(struct pt_regs *args) 48 + { 49 + struct elem *work; 50 + struct task_struct *task; 51 + 52 + task = bpf_get_current_task_btf(); 53 + work = bpf_map_lookup_elem(&arrmap, &key); 54 + if (!work) 55 + return 0; 56 + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); 57 + return 0; 58 + } 59 + 60 + SEC("perf_event") 61 + __failure __msg("arg#1 doesn't point to a map value") 62 + int no_map_task_work(struct pt_regs *args) 63 + { 64 + struct task_struct *task; 65 + struct bpf_task_work tw; 66 + 67 + task = bpf_get_current_task_btf(); 68 + bpf_task_work_schedule_resume(task, &tw, &hmap, process_work, NULL); 69 + return 0; 70 + } 71 + 72 + SEC("perf_event") 73 + __failure __msg("Possibly NULL pointer passed to trusted arg1") 74 + int task_work_null(struct pt_regs *args) 75 + { 
76 + struct task_struct *task; 77 + 78 + task = bpf_get_current_task_btf(); 79 + bpf_task_work_schedule_resume(task, NULL, &hmap, process_work, NULL); 80 + return 0; 81 + } 82 + 83 + SEC("perf_event") 84 + __failure __msg("Possibly NULL pointer passed to trusted arg2") 85 + int map_null(struct pt_regs *args) 86 + { 87 + struct elem *work; 88 + struct task_struct *task; 89 + 90 + task = bpf_get_current_task_btf(); 91 + work = bpf_map_lookup_elem(&arrmap, &key); 92 + if (!work) 93 + return 0; 94 + bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work, NULL); 95 + return 0; 96 + }
+73
tools/testing/selftests/bpf/progs/task_work_stress.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 + 4 + #include <vmlinux.h> 5 + #include <string.h> 6 + #include <stdbool.h> 7 + #include <bpf/bpf_helpers.h> 8 + #include <bpf/bpf_tracing.h> 9 + #include "bpf_misc.h" 10 + 11 + #define ENTRIES 128 12 + 13 + char _license[] SEC("license") = "GPL"; 14 + 15 + __u64 callback_scheduled = 0; 16 + __u64 callback_success = 0; 17 + __u64 schedule_error = 0; 18 + __u64 delete_success = 0; 19 + 20 + struct elem { 21 + __u32 count; 22 + struct bpf_task_work tw; 23 + }; 24 + 25 + struct { 26 + __uint(type, BPF_MAP_TYPE_HASH); 27 + __uint(map_flags, BPF_F_NO_PREALLOC); 28 + __uint(max_entries, ENTRIES); 29 + __type(key, int); 30 + __type(value, struct elem); 31 + } hmap SEC(".maps"); 32 + 33 + static int process_work(struct bpf_map *map, void *key, void *value) 34 + { 35 + __sync_fetch_and_add(&callback_success, 1); 36 + return 0; 37 + } 38 + 39 + SEC("syscall") 40 + int schedule_task_work(void *ctx) 41 + { 42 + struct elem empty_work = {.count = 0}; 43 + struct elem *work; 44 + int key = 0, err; 45 + 46 + key = bpf_ktime_get_ns() % ENTRIES; 47 + work = bpf_map_lookup_elem(&hmap, &key); 48 + if (!work) { 49 + bpf_map_update_elem(&hmap, &key, &empty_work, BPF_NOEXIST); 50 + work = bpf_map_lookup_elem(&hmap, &key); 51 + if (!work) 52 + return 0; 53 + } 54 + err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, 55 + process_work, NULL); 56 + if (err) 57 + __sync_fetch_and_add(&schedule_error, 1); 58 + else 59 + __sync_fetch_and_add(&callback_scheduled, 1); 60 + return 0; 61 + } 62 + 63 + SEC("syscall") 64 + int delete_task_work(void *ctx) 65 + { 66 + int key = 0, err; 67 + 68 + key = bpf_get_prandom_u32() % ENTRIES; 69 + err = bpf_map_delete_elem(&hmap, &key); 70 + if (!err) 71 + __sync_fetch_and_add(&delete_success, 1); 72 + return 0; 73 + }