Merge branch 'bpf-introduce-deferred-task-context-execution'

+11

include/linux/bpf.h

··· 209 209 BPF_WORKQUEUE = (1 << 10), 210 210 BPF_UPTR = (1 << 11), 211 211 BPF_RES_SPIN_LOCK = (1 << 12), 212 + BPF_TASK_WORK = (1 << 13), 212 213 }; 213 214 214 215 enum bpf_cgroup_storage_type { ··· 263 262 int timer_off; 264 263 int wq_off; 265 264 int refcount_off; 265 + int task_work_off; 266 266 struct btf_field fields[]; 267 267 }; 268 268 ··· 365 363 return "bpf_rb_node"; 366 364 case BPF_REFCOUNT: 367 365 return "bpf_refcount"; 366 + case BPF_TASK_WORK: 367 + return "bpf_task_work"; 368 368 default: 369 369 WARN_ON_ONCE(1); 370 370 return "unknown"; ··· 405 401 return sizeof(struct bpf_rb_node); 406 402 case BPF_REFCOUNT: 407 403 return sizeof(struct bpf_refcount); 404 + case BPF_TASK_WORK: 405 + return sizeof(struct bpf_task_work); 408 406 default: 409 407 WARN_ON_ONCE(1); 410 408 return 0; ··· 439 433 return __alignof__(struct bpf_rb_node); 440 434 case BPF_REFCOUNT: 441 435 return __alignof__(struct bpf_refcount); 436 + case BPF_TASK_WORK: 437 + return __alignof__(struct bpf_task_work); 442 438 default: 443 439 WARN_ON_ONCE(1); 444 440 return 0; ··· 472 464 case BPF_KPTR_REF: 473 465 case BPF_KPTR_PERCPU: 474 466 case BPF_UPTR: 467 + case BPF_TASK_WORK: 475 468 break; 476 469 default: 477 470 WARN_ON_ONCE(1); ··· 609 600 bool lock_src); 610 601 void bpf_timer_cancel_and_free(void *timer); 611 602 void bpf_wq_cancel_and_free(void *timer); 603 + void bpf_task_work_cancel_and_free(void *timer); 612 604 void bpf_list_head_free(const struct btf_field *field, void *list_head, 613 605 struct bpf_spin_lock *spin_lock); 614 606 void bpf_rb_root_free(const struct btf_field *field, void *rb_root, ··· 2436 2426 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); 2437 2427 void bpf_obj_free_timer(const struct btf_record *rec, void *obj); 2438 2428 void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); 2429 + void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); 2439 2430 void 
bpf_obj_free_fields(const struct btf_record *rec, void *obj); 2440 2431 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); 2441 2432

+4

include/uapi/linux/bpf.h

··· 7436 7436 __u64 __opaque[2]; 7437 7437 } __attribute__((aligned(8))); 7438 7438 7439 + struct bpf_task_work { 7440 + __u64 __opaque; 7441 + } __attribute__((aligned(8))); 7442 + 7439 7443 struct bpf_wq { 7440 7444 __u64 __opaque[2]; 7441 7445 } __attribute__((aligned(8)));

+5 -3

kernel/bpf/arraymap.c

··· 443 443 return (void *)round_down((unsigned long)array, PAGE_SIZE); 444 444 } 445 445 446 - static void array_map_free_timers_wq(struct bpf_map *map) 446 + static void array_map_free_internal_structs(struct bpf_map *map) 447 447 { 448 448 struct bpf_array *array = container_of(map, struct bpf_array, map); 449 449 int i; ··· 451 451 /* We don't reset or free fields other than timer and workqueue 452 452 * on uref dropping to zero. 453 453 */ 454 - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { 454 + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { 455 455 for (i = 0; i < array->map.max_entries; i++) { 456 456 if (btf_record_has_field(map->record, BPF_TIMER)) 457 457 bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); 458 458 if (btf_record_has_field(map->record, BPF_WORKQUEUE)) 459 459 bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); 460 + if (btf_record_has_field(map->record, BPF_TASK_WORK)) 461 + bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); 460 462 } 461 463 } 462 464 } ··· 797 795 .map_alloc = array_map_alloc, 798 796 .map_free = array_map_free, 799 797 .map_get_next_key = array_map_get_next_key, 800 - .map_release_uref = array_map_free_timers_wq, 798 + .map_release_uref = array_map_free_internal_structs, 801 799 .map_lookup_elem = array_map_lookup_elem, 802 800 .map_update_elem = array_map_update_elem, 803 801 .map_delete_elem = array_map_delete_elem,

+38 -49

kernel/bpf/btf.c

··· 3478 3478 return BTF_FIELD_FOUND; 3479 3479 } 3480 3480 3481 - #define field_mask_test_name(field_type, field_type_str) \ 3482 - if (field_mask & field_type && !strcmp(name, field_type_str)) { \ 3483 - type = field_type; \ 3484 - goto end; \ 3485 - } 3486 - 3487 3481 static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type, 3488 - u32 field_mask, u32 *seen_mask, 3489 - int *align, int *sz) 3482 + u32 field_mask, u32 *seen_mask, int *align, int *sz) 3490 3483 { 3491 - int type = 0; 3484 + const struct { 3485 + enum btf_field_type type; 3486 + const char *const name; 3487 + const bool is_unique; 3488 + } field_types[] = { 3489 + { BPF_SPIN_LOCK, "bpf_spin_lock", true }, 3490 + { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true }, 3491 + { BPF_TIMER, "bpf_timer", true }, 3492 + { BPF_WORKQUEUE, "bpf_wq", true }, 3493 + { BPF_TASK_WORK, "bpf_task_work", true }, 3494 + { BPF_LIST_HEAD, "bpf_list_head", false }, 3495 + { BPF_LIST_NODE, "bpf_list_node", false }, 3496 + { BPF_RB_ROOT, "bpf_rb_root", false }, 3497 + { BPF_RB_NODE, "bpf_rb_node", false }, 3498 + { BPF_REFCOUNT, "bpf_refcount", false }, 3499 + }; 3500 + int type = 0, i; 3492 3501 const char *name = __btf_name_by_offset(btf, var_type->name_off); 3502 + const char *field_type_name; 3503 + enum btf_field_type field_type; 3504 + bool is_unique; 3493 3505 3494 - if (field_mask & BPF_SPIN_LOCK) { 3495 - if (!strcmp(name, "bpf_spin_lock")) { 3496 - if (*seen_mask & BPF_SPIN_LOCK) 3506 + for (i = 0; i < ARRAY_SIZE(field_types); ++i) { 3507 + field_type = field_types[i].type; 3508 + field_type_name = field_types[i].name; 3509 + is_unique = field_types[i].is_unique; 3510 + if (!(field_mask & field_type) || strcmp(name, field_type_name)) 3511 + continue; 3512 + if (is_unique) { 3513 + if (*seen_mask & field_type) 3497 3514 return -E2BIG; 3498 - *seen_mask |= BPF_SPIN_LOCK; 3499 - type = BPF_SPIN_LOCK; 3500 - goto end; 3515 + *seen_mask |= field_type; 3501 3516 } 3517 + type = field_type; 3518 
+ goto end; 3502 3519 } 3503 - if (field_mask & BPF_RES_SPIN_LOCK) { 3504 - if (!strcmp(name, "bpf_res_spin_lock")) { 3505 - if (*seen_mask & BPF_RES_SPIN_LOCK) 3506 - return -E2BIG; 3507 - *seen_mask |= BPF_RES_SPIN_LOCK; 3508 - type = BPF_RES_SPIN_LOCK; 3509 - goto end; 3510 - } 3511 - } 3512 - if (field_mask & BPF_TIMER) { 3513 - if (!strcmp(name, "bpf_timer")) { 3514 - if (*seen_mask & BPF_TIMER) 3515 - return -E2BIG; 3516 - *seen_mask |= BPF_TIMER; 3517 - type = BPF_TIMER; 3518 - goto end; 3519 - } 3520 - } 3521 - if (field_mask & BPF_WORKQUEUE) { 3522 - if (!strcmp(name, "bpf_wq")) { 3523 - if (*seen_mask & BPF_WORKQUEUE) 3524 - return -E2BIG; 3525 - *seen_mask |= BPF_WORKQUEUE; 3526 - type = BPF_WORKQUEUE; 3527 - goto end; 3528 - } 3529 - } 3530 - field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); 3531 - field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); 3532 - field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); 3533 - field_mask_test_name(BPF_RB_NODE, "bpf_rb_node"); 3534 - field_mask_test_name(BPF_REFCOUNT, "bpf_refcount"); 3535 3520 3536 3521 /* Only return BPF_KPTR when all other types with matchable names fail */ 3537 3522 if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) { ··· 3529 3544 *align = btf_field_type_align(type); 3530 3545 return type; 3531 3546 } 3532 - 3533 - #undef field_mask_test_name 3534 3547 3535 3548 /* Repeat a number of fields for a specified number of times. 3536 3549 * ··· 3676 3693 case BPF_LIST_NODE: 3677 3694 case BPF_RB_NODE: 3678 3695 case BPF_REFCOUNT: 3696 + case BPF_TASK_WORK: 3679 3697 ret = btf_find_struct(btf, var_type, off, sz, field_type, 3680 3698 info_cnt ? 
&info[0] : &tmp); 3681 3699 if (ret < 0) ··· 3969 3985 rec->timer_off = -EINVAL; 3970 3986 rec->wq_off = -EINVAL; 3971 3987 rec->refcount_off = -EINVAL; 3988 + rec->task_work_off = -EINVAL; 3972 3989 for (i = 0; i < cnt; i++) { 3973 3990 field_type_size = btf_field_type_size(info_arr[i].type); 3974 3991 if (info_arr[i].off + field_type_size > value_size) { ··· 4008 4023 WARN_ON_ONCE(rec->wq_off >= 0); 4009 4024 /* Cache offset for faster lookup at runtime */ 4010 4025 rec->wq_off = rec->fields[i].offset; 4026 + break; 4027 + case BPF_TASK_WORK: 4028 + WARN_ON_ONCE(rec->task_work_off >= 0); 4029 + rec->task_work_off = rec->fields[i].offset; 4011 4030 break; 4012 4031 case BPF_REFCOUNT: 4013 4032 WARN_ON_ONCE(rec->refcount_off >= 0);

+23 -20

kernel/bpf/hashtab.c

··· 215 215 return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); 216 216 } 217 217 218 - static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) 218 + static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem) 219 + { 220 + if (btf_record_has_field(htab->map.record, BPF_TIMER)) 221 + bpf_obj_free_timer(htab->map.record, 222 + htab_elem_value(elem, htab->map.key_size)); 223 + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) 224 + bpf_obj_free_workqueue(htab->map.record, 225 + htab_elem_value(elem, htab->map.key_size)); 226 + if (btf_record_has_field(htab->map.record, BPF_TASK_WORK)) 227 + bpf_obj_free_task_work(htab->map.record, 228 + htab_elem_value(elem, htab->map.key_size)); 229 + } 230 + 231 + static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) 219 232 { 220 233 u32 num_entries = htab->map.max_entries; 221 234 int i; ··· 240 227 struct htab_elem *elem; 241 228 242 229 elem = get_htab_elem(htab, i); 243 - if (btf_record_has_field(htab->map.record, BPF_TIMER)) 244 - bpf_obj_free_timer(htab->map.record, 245 - htab_elem_value(elem, htab->map.key_size)); 246 - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) 247 - bpf_obj_free_workqueue(htab->map.record, 248 - htab_elem_value(elem, htab->map.key_size)); 230 + htab_free_internal_structs(htab, elem); 249 231 cond_resched(); 250 232 } 251 233 } ··· 1498 1490 } 1499 1491 } 1500 1492 1501 - static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) 1493 + static void htab_free_malloced_internal_structs(struct bpf_htab *htab) 1502 1494 { 1503 1495 int i; 1504 1496 ··· 1510 1502 1511 1503 hlist_nulls_for_each_entry(l, n, head, hash_node) { 1512 1504 /* We only free timer on uref dropping to zero */ 1513 - if (btf_record_has_field(htab->map.record, BPF_TIMER)) 1514 - bpf_obj_free_timer(htab->map.record, 1515 - htab_elem_value(l, htab->map.key_size)); 1516 - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) 
1517 - bpf_obj_free_workqueue(htab->map.record, 1518 - htab_elem_value(l, htab->map.key_size)); 1505 + htab_free_internal_structs(htab, l); 1519 1506 } 1520 1507 cond_resched_rcu(); 1521 1508 } 1522 1509 rcu_read_unlock(); 1523 1510 } 1524 1511 1525 - static void htab_map_free_timers_and_wq(struct bpf_map *map) 1512 + static void htab_map_free_internal_structs(struct bpf_map *map) 1526 1513 { 1527 1514 struct bpf_htab *htab = container_of(map, struct bpf_htab, map); 1528 1515 1529 1516 /* We only free timer and workqueue on uref dropping to zero */ 1530 - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { 1517 + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { 1531 1518 if (!htab_is_prealloc(htab)) 1532 - htab_free_malloced_timers_and_wq(htab); 1519 + htab_free_malloced_internal_structs(htab); 1533 1520 else 1534 - htab_free_prealloced_timers_and_wq(htab); 1521 + htab_free_prealloced_internal_structs(htab); 1535 1522 } 1536 1523 } 1537 1524 ··· 2258 2255 .map_alloc = htab_map_alloc, 2259 2256 .map_free = htab_map_free, 2260 2257 .map_get_next_key = htab_map_get_next_key, 2261 - .map_release_uref = htab_map_free_timers_and_wq, 2258 + .map_release_uref = htab_map_free_internal_structs, 2262 2259 .map_lookup_elem = htab_map_lookup_elem, 2263 2260 .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, 2264 2261 .map_update_elem = htab_map_update_elem, ··· 2279 2276 .map_alloc = htab_map_alloc, 2280 2277 .map_free = htab_map_free, 2281 2278 .map_get_next_key = htab_map_get_next_key, 2282 - .map_release_uref = htab_map_free_timers_and_wq, 2279 + .map_release_uref = htab_map_free_internal_structs, 2283 2280 .map_lookup_elem = htab_lru_map_lookup_elem, 2284 2281 .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, 2285 2282 .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,

+341 -17

kernel/bpf/helpers.c

··· 26 26 #include <linux/bpf_verifier.h> 27 27 #include <linux/uaccess.h> 28 28 #include <linux/verification.h> 29 + #include <linux/task_work.h> 30 + #include <linux/irq_work.h> 29 31 30 32 #include "../../lib/kstrtox.h" 31 33 ··· 1084 1082 .arg5_type = ARG_CONST_SIZE_OR_ZERO, 1085 1083 }; 1086 1084 1085 + static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) 1086 + { 1087 + if (map->map_type == BPF_MAP_TYPE_ARRAY) { 1088 + struct bpf_array *array = container_of(map, struct bpf_array, map); 1089 + 1090 + *arr_idx = ((char *)value - array->value) / array->elem_size; 1091 + return arr_idx; 1092 + } 1093 + return (void *)value - round_up(map->key_size, 8); 1094 + } 1095 + 1087 1096 struct bpf_async_cb { 1088 1097 struct bpf_map *map; 1089 1098 struct bpf_prog *prog; ··· 1177 1164 * bpf_map_delete_elem() on the same timer. 1178 1165 */ 1179 1166 this_cpu_write(hrtimer_running, t); 1180 - if (map->map_type == BPF_MAP_TYPE_ARRAY) { 1181 - struct bpf_array *array = container_of(map, struct bpf_array, map); 1182 1167 1183 - /* compute the key */ 1184 - idx = ((char *)value - array->value) / array->elem_size; 1185 - key = &idx; 1186 - } else { /* hash or lru */ 1187 - key = value - round_up(map->key_size, 8); 1188 - } 1168 + key = map_key_from_value(map, value, &idx); 1189 1169 1190 1170 callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0); 1191 1171 /* The verifier checked that return value is zero. 
*/ ··· 1204 1198 if (!callback_fn) 1205 1199 return; 1206 1200 1207 - if (map->map_type == BPF_MAP_TYPE_ARRAY) { 1208 - struct bpf_array *array = container_of(map, struct bpf_array, map); 1209 - 1210 - /* compute the key */ 1211 - idx = ((char *)value - array->value) / array->elem_size; 1212 - key = &idx; 1213 - } else { /* hash or lru */ 1214 - key = value - round_up(map->key_size, 8); 1215 - } 1201 + key = map_key_from_value(map, value, &idx); 1216 1202 1217 1203 rcu_read_lock_trace(); 1218 1204 migrate_disable(); ··· 3904 3906 } 3905 3907 #endif /* CONFIG_KEYS */ 3906 3908 3909 + typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); 3910 + 3911 + enum bpf_task_work_state { 3912 + /* bpf_task_work is ready to be used */ 3913 + BPF_TW_STANDBY = 0, 3914 + /* irq work scheduling in progress */ 3915 + BPF_TW_PENDING, 3916 + /* task work scheduling in progress */ 3917 + BPF_TW_SCHEDULING, 3918 + /* task work is scheduled successfully */ 3919 + BPF_TW_SCHEDULED, 3920 + /* callback is running */ 3921 + BPF_TW_RUNNING, 3922 + /* associated BPF map value is deleted */ 3923 + BPF_TW_FREED, 3924 + }; 3925 + 3926 + struct bpf_task_work_ctx { 3927 + enum bpf_task_work_state state; 3928 + refcount_t refcnt; 3929 + struct callback_head work; 3930 + struct irq_work irq_work; 3931 + /* bpf_prog that schedules task work */ 3932 + struct bpf_prog *prog; 3933 + /* task for which callback is scheduled */ 3934 + struct task_struct *task; 3935 + /* the map and map value associated with this context */ 3936 + struct bpf_map *map; 3937 + void *map_val; 3938 + enum task_work_notify_mode mode; 3939 + bpf_task_work_callback_t callback_fn; 3940 + struct rcu_head rcu; 3941 + } __aligned(8); 3942 + 3943 + /* Actual type for struct bpf_task_work */ 3944 + struct bpf_task_work_kern { 3945 + struct bpf_task_work_ctx *ctx; 3946 + }; 3947 + 3948 + static void bpf_task_work_ctx_reset(struct bpf_task_work_ctx *ctx) 3949 + { 3950 + if (ctx->prog) { 3951 + 
bpf_prog_put(ctx->prog); 3952 + ctx->prog = NULL; 3953 + } 3954 + if (ctx->task) { 3955 + bpf_task_release(ctx->task); 3956 + ctx->task = NULL; 3957 + } 3958 + } 3959 + 3960 + static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx) 3961 + { 3962 + return refcount_inc_not_zero(&ctx->refcnt); 3963 + } 3964 + 3965 + static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) 3966 + { 3967 + if (!refcount_dec_and_test(&ctx->refcnt)) 3968 + return; 3969 + 3970 + bpf_task_work_ctx_reset(ctx); 3971 + 3972 + /* bpf_mem_free expects migration to be disabled */ 3973 + migrate_disable(); 3974 + bpf_mem_free(&bpf_global_ma, ctx); 3975 + migrate_enable(); 3976 + } 3977 + 3978 + static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) 3979 + { 3980 + /* 3981 + * Scheduled task_work callback holds ctx ref, so if we successfully 3982 + * cancelled, we put that ref on callback's behalf. If we couldn't 3983 + * cancel, callback will inevitably run or has already completed 3984 + * running, and it would have taken care of its ctx ref itself. 3985 + */ 3986 + if (task_work_cancel(ctx->task, &ctx->work)) 3987 + bpf_task_work_ctx_put(ctx); 3988 + } 3989 + 3990 + static void bpf_task_work_callback(struct callback_head *cb) 3991 + { 3992 + struct bpf_task_work_ctx *ctx = container_of(cb, struct bpf_task_work_ctx, work); 3993 + enum bpf_task_work_state state; 3994 + u32 idx; 3995 + void *key; 3996 + 3997 + /* Read lock is needed to protect ctx and map key/value access */ 3998 + guard(rcu_tasks_trace)(); 3999 + /* 4000 + * This callback may start running before bpf_task_work_irq() switched to 4001 + * SCHEDULED state, so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING. 
4002 + */ 4003 + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING); 4004 + if (state == BPF_TW_SCHEDULED) 4005 + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING); 4006 + if (state == BPF_TW_FREED) { 4007 + bpf_task_work_ctx_put(ctx); 4008 + return; 4009 + } 4010 + 4011 + key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx); 4012 + 4013 + migrate_disable(); 4014 + ctx->callback_fn(ctx->map, key, ctx->map_val); 4015 + migrate_enable(); 4016 + 4017 + bpf_task_work_ctx_reset(ctx); 4018 + (void)cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY); 4019 + 4020 + bpf_task_work_ctx_put(ctx); 4021 + } 4022 + 4023 + static void bpf_task_work_irq(struct irq_work *irq_work) 4024 + { 4025 + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); 4026 + enum bpf_task_work_state state; 4027 + int err; 4028 + 4029 + guard(rcu_tasks_trace)(); 4030 + 4031 + if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { 4032 + bpf_task_work_ctx_put(ctx); 4033 + return; 4034 + } 4035 + 4036 + err = task_work_add(ctx->task, &ctx->work, ctx->mode); 4037 + if (err) { 4038 + bpf_task_work_ctx_reset(ctx); 4039 + /* 4040 + * try to switch back to STANDBY for another task_work reuse, but we might have 4041 + * gone to FREED already, which is fine as we already cleaned up after ourselves 4042 + */ 4043 + (void)cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY); 4044 + bpf_task_work_ctx_put(ctx); 4045 + return; 4046 + } 4047 + 4048 + /* 4049 + * It's technically possible for just scheduled task_work callback to 4050 + * complete running by now, going SCHEDULING -> RUNNING and then 4051 + * dropping its ctx refcount. Instead of capturing extra ref just to 4052 + * protect below ctx->state access, we rely on RCU protection to 4053 + * perform below SCHEDULING -> SCHEDULED attempt. 
4054 + */ 4055 + state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); 4056 + if (state == BPF_TW_FREED) 4057 + bpf_task_work_cancel(ctx); /* clean up if we switched into FREED state */ 4058 + } 4059 + 4060 + static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *tw, 4061 + struct bpf_map *map) 4062 + { 4063 + struct bpf_task_work_kern *twk = (void *)tw; 4064 + struct bpf_task_work_ctx *ctx, *old_ctx; 4065 + 4066 + ctx = READ_ONCE(twk->ctx); 4067 + if (ctx) 4068 + return ctx; 4069 + 4070 + ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx)); 4071 + if (!ctx) 4072 + return ERR_PTR(-ENOMEM); 4073 + 4074 + memset(ctx, 0, sizeof(*ctx)); 4075 + refcount_set(&ctx->refcnt, 1); /* map's own ref */ 4076 + ctx->state = BPF_TW_STANDBY; 4077 + 4078 + old_ctx = cmpxchg(&twk->ctx, NULL, ctx); 4079 + if (old_ctx) { 4080 + /* 4081 + * tw->ctx is set by concurrent BPF program, release allocated 4082 + * memory and try to reuse already set context. 4083 + */ 4084 + bpf_mem_free(&bpf_global_ma, ctx); 4085 + return old_ctx; 4086 + } 4087 + 4088 + return ctx; /* Success */ 4089 + } 4090 + 4091 + static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work *tw, 4092 + struct bpf_map *map) 4093 + { 4094 + struct bpf_task_work_ctx *ctx; 4095 + 4096 + ctx = bpf_task_work_fetch_ctx(tw, map); 4097 + if (IS_ERR(ctx)) 4098 + return ctx; 4099 + 4100 + /* try to get ref for task_work callback to hold */ 4101 + if (!bpf_task_work_ctx_tryget(ctx)) 4102 + return ERR_PTR(-EBUSY); 4103 + 4104 + if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { 4105 + /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ 4106 + bpf_task_work_ctx_put(ctx); 4107 + return ERR_PTR(-EBUSY); 4108 + } 4109 + 4110 + /* 4111 + * If no process or bpffs is holding a reference to the map, no new callbacks should be 4112 + * scheduled. 
This does not address any race or correctness issue, but rather is a policy 4113 + * choice: dropping user references should stop everything. 4114 + */ 4115 + if (!atomic64_read(&map->usercnt)) { 4116 + /* drop ref we just got for task_work callback itself */ 4117 + bpf_task_work_ctx_put(ctx); 4118 + /* transfer map's ref into cancel_and_free() */ 4119 + bpf_task_work_cancel_and_free(tw); 4120 + return ERR_PTR(-EBUSY); 4121 + } 4122 + 4123 + return ctx; 4124 + } 4125 + 4126 + static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw, 4127 + struct bpf_map *map, bpf_task_work_callback_t callback_fn, 4128 + struct bpf_prog_aux *aux, enum task_work_notify_mode mode) 4129 + { 4130 + struct bpf_prog *prog; 4131 + struct bpf_task_work_ctx *ctx; 4132 + int err; 4133 + 4134 + BTF_TYPE_EMIT(struct bpf_task_work); 4135 + 4136 + prog = bpf_prog_inc_not_zero(aux->prog); 4137 + if (IS_ERR(prog)) 4138 + return -EBADF; 4139 + task = bpf_task_acquire(task); 4140 + if (!task) { 4141 + err = -EBADF; 4142 + goto release_prog; 4143 + } 4144 + 4145 + ctx = bpf_task_work_acquire_ctx(tw, map); 4146 + if (IS_ERR(ctx)) { 4147 + err = PTR_ERR(ctx); 4148 + goto release_all; 4149 + } 4150 + 4151 + ctx->task = task; 4152 + ctx->callback_fn = callback_fn; 4153 + ctx->prog = prog; 4154 + ctx->mode = mode; 4155 + ctx->map = map; 4156 + ctx->map_val = (void *)tw - map->record->task_work_off; 4157 + init_task_work(&ctx->work, bpf_task_work_callback); 4158 + init_irq_work(&ctx->irq_work, bpf_task_work_irq); 4159 + 4160 + irq_work_queue(&ctx->irq_work); 4161 + return 0; 4162 + 4163 + release_all: 4164 + bpf_task_release(task); 4165 + release_prog: 4166 + bpf_prog_put(prog); 4167 + return err; 4168 + } 4169 + 4170 + /** 4171 + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode 4172 + * @task: Task struct for which callback should be scheduled 4173 + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping 
4174 + * @map__map: bpf_map that embeds struct bpf_task_work in the values 4175 + * @callback: pointer to BPF subprogram to call 4176 + * @aux__prog: user should pass NULL 4177 + * 4178 + * Return: 0 if task work has been scheduled successfully, negative error code otherwise 4179 + */ 4180 + __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, 4181 + void *map__map, bpf_task_work_callback_t callback, 4182 + void *aux__prog) 4183 + { 4184 + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); 4185 + } 4186 + 4187 + /** 4188 + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode 4189 + * @task: Task struct for which callback should be scheduled 4190 + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping 4191 + * @map__map: bpf_map that embeds struct bpf_task_work in the values 4192 + * @callback: pointer to BPF subprogram to call 4193 + * @aux__prog: user should pass NULL 4194 + * 4195 + * Return: 0 if task work has been scheduled successfully, negative error code otherwise 4196 + */ 4197 + __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, 4198 + void *map__map, bpf_task_work_callback_t callback, 4199 + void *aux__prog) 4200 + { 4201 + return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); 4202 + } 4203 + 3907 4204 __bpf_kfunc_end_defs(); 4205 + 4206 + static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) 4207 + { 4208 + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); 4209 + 4210 + bpf_task_work_cancel(ctx); /* this might put task_work callback's ref */ 4211 + bpf_task_work_ctx_put(ctx); /* and here we put map's own ref that was transferred to us */ 4212 + } 4213 + 4214 + void bpf_task_work_cancel_and_free(void *val) 4215 + { 4216 + struct bpf_task_work_kern *twk = val; 4217 + struct 
bpf_task_work_ctx *ctx; 4218 + enum bpf_task_work_state state; 4219 + 4220 + ctx = xchg(&twk->ctx, NULL); 4221 + if (!ctx) 4222 + return; 4223 + 4224 + state = xchg(&ctx->state, BPF_TW_FREED); 4225 + if (state == BPF_TW_SCHEDULED) { 4226 + /* run in irq_work to avoid locks in NMI */ 4227 + init_irq_work(&ctx->irq_work, bpf_task_work_cancel_scheduled); 4228 + irq_work_queue(&ctx->irq_work); 4229 + return; 4230 + } 4231 + 4232 + bpf_task_work_ctx_put(ctx); /* put bpf map's ref */ 4233 + } 3908 4234 3909 4235 BTF_KFUNCS_START(generic_btf_ids) 3910 4236 #ifdef CONFIG_CRASH_DUMP ··· 4372 4050 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) 4373 4051 #endif 4374 4052 BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) 4053 + BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) 4054 + BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) 4375 4055 BTF_KFUNCS_END(common_btf_ids) 4376 4056 4377 4057 static const struct btf_kfunc_id_set common_kfunc_set = {

+15 -1

kernel/bpf/syscall.c

··· 674 674 case BPF_TIMER: 675 675 case BPF_REFCOUNT: 676 676 case BPF_WORKQUEUE: 677 + case BPF_TASK_WORK: 677 678 /* Nothing to release */ 678 679 break; 679 680 default: ··· 728 727 case BPF_TIMER: 729 728 case BPF_REFCOUNT: 730 729 case BPF_WORKQUEUE: 730 + case BPF_TASK_WORK: 731 731 /* Nothing to acquire */ 732 732 break; 733 733 default: ··· 787 785 bpf_wq_cancel_and_free(obj + rec->wq_off); 788 786 } 789 787 788 + void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) 789 + { 790 + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) 791 + return; 792 + bpf_task_work_cancel_and_free(obj + rec->task_work_off); 793 + } 794 + 790 795 void bpf_obj_free_fields(const struct btf_record *rec, void *obj) 791 796 { 792 797 const struct btf_field *fields; ··· 817 808 break; 818 809 case BPF_WORKQUEUE: 819 810 bpf_wq_cancel_and_free(field_ptr); 811 + break; 812 + case BPF_TASK_WORK: 813 + bpf_task_work_cancel_and_free(field_ptr); 820 814 break; 821 815 case BPF_KPTR_UNREF: 822 816 WRITE_ONCE(*(u64 *)field_ptr, 0); ··· 1252 1240 1253 1241 map->record = btf_parse_fields(btf, value_type, 1254 1242 BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | 1255 - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, 1243 + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | 1244 + BPF_TASK_WORK, 1256 1245 map->value_size); 1257 1246 if (!IS_ERR_OR_NULL(map->record)) { 1258 1247 int i; ··· 1285 1272 break; 1286 1273 case BPF_TIMER: 1287 1274 case BPF_WORKQUEUE: 1275 + case BPF_TASK_WORK: 1288 1276 if (map->map_type != BPF_MAP_TYPE_HASH && 1289 1277 map->map_type != BPF_MAP_TYPE_LRU_HASH && 1290 1278 map->map_type != BPF_MAP_TYPE_ARRAY) {

+149 -20

kernel/bpf/verifier.c

··· 2224 2224 /* transfer reg's id which is unique for every map_lookup_elem 2225 2225 * as UID of the inner map. 2226 2226 */ 2227 - if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) 2227 + if (btf_record_has_field(map->inner_map_meta->record, 2228 + BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { 2228 2229 reg->map_uid = reg->id; 2229 - if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) 2230 - reg->map_uid = reg->id; 2230 + } 2231 2231 } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { 2232 2232 reg->type = PTR_TO_XDP_SOCK; 2233 2233 } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || ··· 8431 8431 return 0; 8432 8432 } 8433 8433 8434 - static int process_timer_func(struct bpf_verifier_env *env, int regno, 8435 - struct bpf_call_arg_meta *meta) 8434 + /* Check if @regno is a pointer to a specific field in a map value */ 8435 + static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, 8436 + enum btf_field_type field_type) 8436 8437 { 8437 8438 struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 8438 8439 bool is_const = tnum_is_const(reg->var_off); 8439 8440 struct bpf_map *map = reg->map_ptr; 8440 8441 u64 val = reg->var_off.value; 8442 + const char *struct_name = btf_field_type_name(field_type); 8443 + int field_off = -1; 8441 8444 8442 8445 if (!is_const) { 8443 8446 verbose(env, 8444 - "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n", 8445 - regno); 8447 + "R%d doesn't have constant offset. 
%s has to be at the constant offset\n", 8448 + regno, struct_name); 8446 8449 return -EINVAL; 8447 8450 } 8448 8451 if (!map->btf) { 8449 - verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n", 8450 - map->name); 8452 + verbose(env, "map '%s' has to have BTF in order to use %s\n", map->name, 8453 + struct_name); 8451 8454 return -EINVAL; 8452 8455 } 8453 - if (!btf_record_has_field(map->record, BPF_TIMER)) { 8454 - verbose(env, "map '%s' has no valid bpf_timer\n", map->name); 8456 + if (!btf_record_has_field(map->record, field_type)) { 8457 + verbose(env, "map '%s' has no valid %s\n", map->name, struct_name); 8455 8458 return -EINVAL; 8456 8459 } 8457 - if (map->record->timer_off != val + reg->off) { 8458 - verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", 8459 - val + reg->off, map->record->timer_off); 8460 + switch (field_type) { 8461 + case BPF_TIMER: 8462 + field_off = map->record->timer_off; 8463 + break; 8464 + case BPF_TASK_WORK: 8465 + field_off = map->record->task_work_off; 8466 + break; 8467 + default: 8468 + verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); 8460 8469 return -EINVAL; 8461 8470 } 8471 + if (field_off != val + reg->off) { 8472 + verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n", 8473 + val + reg->off, struct_name, field_off); 8474 + return -EINVAL; 8475 + } 8476 + return 0; 8477 + } 8478 + 8479 + static int process_timer_func(struct bpf_verifier_env *env, int regno, 8480 + struct bpf_call_arg_meta *meta) 8481 + { 8482 + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 8483 + struct bpf_map *map = reg->map_ptr; 8484 + int err; 8485 + 8486 + err = check_map_field_pointer(env, regno, BPF_TIMER); 8487 + if (err) 8488 + return err; 8489 + 8462 8490 if (meta->map_ptr) { 8463 8491 verifier_bug(env, "Two map pointers in a timer helper"); 8464 8492 return -EFAULT; ··· 8511 8483 verbose(env, "off %lld doesn't point to 'struct bpf_wq' that is at %d\n", 8512 
8484 val + reg->off, map->record->wq_off); 8513 8485 return -EINVAL; 8486 + } 8487 + meta->map.uid = reg->map_uid; 8488 + meta->map.ptr = map; 8489 + return 0; 8490 + } 8491 + 8492 + static int process_task_work_func(struct bpf_verifier_env *env, int regno, 8493 + struct bpf_kfunc_call_arg_meta *meta) 8494 + { 8495 + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; 8496 + struct bpf_map *map = reg->map_ptr; 8497 + int err; 8498 + 8499 + err = check_map_field_pointer(env, regno, BPF_TASK_WORK); 8500 + if (err) 8501 + return err; 8502 + 8503 + if (meta->map.ptr) { 8504 + verifier_bug(env, "Two map pointers in a bpf_task_work helper"); 8505 + return -EFAULT; 8514 8506 } 8515 8507 meta->map.uid = reg->map_uid; 8516 8508 meta->map.ptr = map; ··· 10366 10318 struct bpf_func_state *callee, 10367 10319 int insn_idx); 10368 10320 10321 + static bool is_task_work_add_kfunc(u32 func_id); 10322 + 10369 10323 static int set_callee_state(struct bpf_verifier_env *env, 10370 10324 struct bpf_func_state *caller, 10371 10325 struct bpf_func_state *callee, int insn_idx); ··· 10586 10536 env->subprog_info[subprog].is_async_cb = true; 10587 10537 async_cb = push_async_cb(env, env->subprog_info[subprog].start, 10588 10538 insn_idx, subprog, 10589 - is_bpf_wq_set_callback_impl_kfunc(insn->imm)); 10539 + is_bpf_wq_set_callback_impl_kfunc(insn->imm) || 10540 + is_task_work_add_kfunc(insn->imm)); 10590 10541 if (!async_cb) 10591 10542 return -EFAULT; 10592 10543 callee = async_cb->frame[0]; ··· 10815 10764 __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); 10816 10765 __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); 10817 10766 callee->in_async_callback_fn = true; 10818 - callee->callback_ret_range = retval_range(0, 1); 10767 + callee->callback_ret_range = retval_range(0, 0); 10819 10768 return 0; 10820 10769 } 10821 10770 ··· 10899 10848 __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); 10900 10849 callee->in_callback_fn = true; 10901 10850 callee->callback_ret_range 
= retval_range(0, 1); 10851 + return 0; 10852 + } 10853 + 10854 + static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, 10855 + struct bpf_func_state *caller, 10856 + struct bpf_func_state *callee, 10857 + int insn_idx) 10858 + { 10859 + struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr; 10860 + 10861 + /* 10862 + * callback_fn(struct bpf_map *map, void *key, void *value); 10863 + */ 10864 + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; 10865 + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); 10866 + callee->regs[BPF_REG_1].map_ptr = map_ptr; 10867 + 10868 + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; 10869 + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); 10870 + callee->regs[BPF_REG_2].map_ptr = map_ptr; 10871 + 10872 + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; 10873 + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); 10874 + callee->regs[BPF_REG_3].map_ptr = map_ptr; 10875 + 10876 + /* unused */ 10877 + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); 10878 + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); 10879 + callee->in_async_callback_fn = true; 10880 + callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); 10902 10881 return 0; 10903 10882 } 10904 10883 ··· 12056 11975 KF_ARG_RB_NODE_ID, 12057 11976 KF_ARG_WORKQUEUE_ID, 12058 11977 KF_ARG_RES_SPIN_LOCK_ID, 11978 + KF_ARG_TASK_WORK_ID, 12059 11979 }; 12060 11980 12061 11981 BTF_ID_LIST(kf_arg_btf_ids) ··· 12067 11985 BTF_ID(struct, bpf_rb_node) 12068 11986 BTF_ID(struct, bpf_wq) 12069 11987 BTF_ID(struct, bpf_res_spin_lock) 11988 + BTF_ID(struct, bpf_task_work) 12070 11989 12071 11990 static bool __is_kfunc_ptr_arg_type(const struct btf *btf, 12072 11991 const struct btf_param *arg, int type) ··· 12114 12031 static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) 12115 12032 { 12116 12033 return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); 12034 + } 12035 + 12036 + static bool is_kfunc_arg_task_work(const struct 
btf *btf, const struct btf_param *arg) 12037 + { 12038 + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID); 12117 12039 } 12118 12040 12119 12041 static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) ··· 12208 12120 KF_ARG_PTR_TO_WORKQUEUE, 12209 12121 KF_ARG_PTR_TO_IRQ_FLAG, 12210 12122 KF_ARG_PTR_TO_RES_SPIN_LOCK, 12123 + KF_ARG_PTR_TO_TASK_WORK, 12211 12124 }; 12212 12125 12213 12126 enum special_kfunc_type { ··· 12258 12169 KF_bpf_res_spin_lock_irqsave, 12259 12170 KF_bpf_res_spin_unlock_irqrestore, 12260 12171 KF___bpf_trap, 12172 + KF_bpf_task_work_schedule_signal, 12173 + KF_bpf_task_work_schedule_resume, 12261 12174 }; 12262 12175 12263 12176 BTF_ID_LIST(special_kfunc_list) ··· 12328 12237 BTF_ID(func, bpf_res_spin_lock_irqsave) 12329 12238 BTF_ID(func, bpf_res_spin_unlock_irqrestore) 12330 12239 BTF_ID(func, __bpf_trap) 12240 + BTF_ID(func, bpf_task_work_schedule_signal) 12241 + BTF_ID(func, bpf_task_work_schedule_resume) 12242 + 12243 + static bool is_task_work_add_kfunc(u32 func_id) 12244 + { 12245 + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || 12246 + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; 12247 + } 12331 12248 12332 12249 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) 12333 12250 { ··· 12425 12326 12426 12327 if (is_kfunc_arg_wq(meta->btf, &args[argno])) 12427 12328 return KF_ARG_PTR_TO_WORKQUEUE; 12329 + 12330 + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) 12331 + return KF_ARG_PTR_TO_TASK_WORK; 12428 12332 12429 12333 if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) 12430 12334 return KF_ARG_PTR_TO_IRQ_FLAG; ··· 12772 12670 12773 12671 static bool is_async_callback_calling_kfunc(u32 btf_id) 12774 12672 { 12775 - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; 12673 + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || 12674 + is_task_work_add_kfunc(btf_id); 12776 12675 } 12777 12676 
12778 12677 static bool is_bpf_throw_kfunc(struct bpf_insn *insn) ··· 13154 13051 verbose(env, "pointer in R%d isn't map pointer\n", regno); 13155 13052 return -EINVAL; 13156 13053 } 13157 - if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { 13054 + if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || 13055 + reg->map_ptr->record->task_work_off >= 0)) { 13158 13056 /* Use map_uid (which is unique id of inner map) to reject: 13159 13057 * inner_map1 = bpf_map_lookup_elem(outer_map, key1) 13160 13058 * inner_map2 = bpf_map_lookup_elem(outer_map, key2) ··· 13170 13066 */ 13171 13067 if (meta->map.ptr != reg->map_ptr || 13172 13068 meta->map.uid != reg->map_uid) { 13069 + if (reg->map_ptr->record->task_work_off >= 0) { 13070 + verbose(env, 13071 + "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n", 13072 + meta->map.uid, reg->map_uid); 13073 + return -EINVAL; 13074 + } 13173 13075 verbose(env, 13174 13076 "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", 13175 13077 meta->map.uid, reg->map_uid); ··· 13214 13104 case KF_ARG_PTR_TO_REFCOUNTED_KPTR: 13215 13105 case KF_ARG_PTR_TO_CONST_STR: 13216 13106 case KF_ARG_PTR_TO_WORKQUEUE: 13107 + case KF_ARG_PTR_TO_TASK_WORK: 13217 13108 case KF_ARG_PTR_TO_IRQ_FLAG: 13218 13109 case KF_ARG_PTR_TO_RES_SPIN_LOCK: 13219 13110 break; ··· 13505 13394 return -EINVAL; 13506 13395 } 13507 13396 ret = process_wq_func(env, regno, meta); 13397 + if (ret < 0) 13398 + return ret; 13399 + break; 13400 + case KF_ARG_PTR_TO_TASK_WORK: 13401 + if (reg->type != PTR_TO_MAP_VALUE) { 13402 + verbose(env, "arg#%d doesn't point to a map value\n", i); 13403 + return -EINVAL; 13404 + } 13405 + ret = process_task_work_func(env, regno, meta); 13508 13406 if (ret < 0) 13509 13407 return ret; 13510 13408 break; ··· 13876 13756 if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { 13877 13757 err = push_callback_call(env, insn, insn_idx, meta.subprogno, 13878 13758 
set_timer_callback_state); 13759 + if (err) { 13760 + verbose(env, "kfunc %s#%d failed callback verification\n", 13761 + func_name, meta.func_id); 13762 + return err; 13763 + } 13764 + } 13765 + 13766 + if (is_task_work_add_kfunc(meta.func_id)) { 13767 + err = push_callback_call(env, insn, insn_idx, meta.subprogno, 13768 + set_task_work_schedule_callback_state); 13879 13769 if (err) { 13880 13770 verbose(env, "kfunc %s#%d failed callback verification\n", 13881 13771 func_name, meta.func_id); ··· 17178 17048 } 17179 17049 17180 17050 if (frame->in_async_callback_fn) { 17181 - /* enforce return zero from async callbacks like timer */ 17182 17051 exit_ctx = "At async callback return"; 17183 - range = retval_range(0, 0); 17052 + range = frame->callback_ret_range; 17184 17053 goto enforce_retval; 17185 17054 } 17186 17055

+4

tools/include/uapi/linux/bpf.h

··· 7436 7436 __u64 __opaque[2]; 7437 7437 } __attribute__((aligned(8))); 7438 7438 7439 + struct bpf_task_work { 7440 + __u64 __opaque; 7441 + } __attribute__((aligned(8))); 7442 + 7439 7443 struct bpf_wq { 7440 7444 __u64 __opaque[2]; 7441 7445 } __attribute__((aligned(8)));

+130

tools/testing/selftests/bpf/prog_tests/task_work_stress.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <test_progs.h> 4 + #include <string.h> 5 + #include <stdio.h> 6 + #include "task_work_stress.skel.h" 7 + #include <linux/bpf.h> 8 + #include <linux/perf_event.h> 9 + #include <sys/syscall.h> 10 + #include <time.h> 11 + #include <stdlib.h> 12 + #include <stdatomic.h> 13 + 14 + struct test_data { 15 + int prog_fd; 16 + atomic_int exit; 17 + }; 18 + 19 + void *runner(void *test_data) 20 + { 21 + struct test_data *td = test_data; 22 + int err = 0; 23 + LIBBPF_OPTS(bpf_test_run_opts, opts); 24 + 25 + while (!err && !atomic_load(&td->exit)) 26 + err = bpf_prog_test_run_opts(td->prog_fd, &opts); 27 + 28 + return NULL; 29 + } 30 + 31 + static int get_env_int(const char *str, int def) 32 + { 33 + const char *s = getenv(str); 34 + char *end; 35 + int retval; 36 + 37 + if (!s || !*s) 38 + return def; 39 + errno = 0; 40 + retval = strtol(s, &end, 10); 41 + if (errno || *end || retval < 0) 42 + return def; 43 + return retval; 44 + } 45 + 46 + static void task_work_run(bool enable_delete) 47 + { 48 + struct task_work_stress *skel; 49 + struct bpf_program *scheduler, *deleter; 50 + int nthreads = 16; 51 + int test_time_s = get_env_int("BPF_TASK_WORK_TEST_TIME", 1); 52 + pthread_t tid[nthreads], tid_del; 53 + bool started[nthreads], started_del = false; 54 + struct test_data td_sched = { .exit = 0 }, td_del = { .exit = 1 }; 55 + int i, err; 56 + 57 + skel = task_work_stress__open(); 58 + if (!ASSERT_OK_PTR(skel, "task_work__open")) 59 + return; 60 + 61 + scheduler = bpf_object__find_program_by_name(skel->obj, "schedule_task_work"); 62 + bpf_program__set_autoload(scheduler, true); 63 + 64 + deleter = bpf_object__find_program_by_name(skel->obj, "delete_task_work"); 65 + bpf_program__set_autoload(deleter, true); 66 + 67 + err = task_work_stress__load(skel); 68 + if (!ASSERT_OK(err, "skel_load")) 69 + goto cleanup; 70 + 71 + for (i = 0; i < nthreads; ++i) 72 + 
started[i] = false; 73 + 74 + td_sched.prog_fd = bpf_program__fd(scheduler); 75 + for (i = 0; i < nthreads; ++i) { 76 + if (pthread_create(&tid[i], NULL, runner, &td_sched) != 0) { 77 + fprintf(stderr, "could not start thread"); 78 + goto cancel; 79 + } 80 + started[i] = true; 81 + } 82 + 83 + if (enable_delete) 84 + atomic_store(&td_del.exit, 0); 85 + 86 + td_del.prog_fd = bpf_program__fd(deleter); 87 + if (pthread_create(&tid_del, NULL, runner, &td_del) != 0) { 88 + fprintf(stderr, "could not start thread"); 89 + goto cancel; 90 + } 91 + started_del = true; 92 + 93 + /* Run stress test for some time */ 94 + sleep(test_time_s); 95 + 96 + cancel: 97 + atomic_store(&td_sched.exit, 1); 98 + atomic_store(&td_del.exit, 1); 99 + for (i = 0; i < nthreads; ++i) { 100 + if (started[i]) 101 + pthread_join(tid[i], NULL); 102 + } 103 + 104 + if (started_del) 105 + pthread_join(tid_del, NULL); 106 + 107 + ASSERT_GT(skel->bss->callback_scheduled, 0, "work scheduled"); 108 + /* Some scheduling attempts should have failed due to contention */ 109 + ASSERT_GT(skel->bss->schedule_error, 0, "schedule error"); 110 + 111 + if (enable_delete) { 112 + /* If delete thread is enabled, it has cancelled some callbacks */ 113 + ASSERT_GT(skel->bss->delete_success, 0, "delete success"); 114 + ASSERT_LT(skel->bss->callback_success, skel->bss->callback_scheduled, "callbacks"); 115 + } else { 116 + /* Without delete thread number of scheduled callbacks is the same as fired */ 117 + ASSERT_EQ(skel->bss->callback_success, skel->bss->callback_scheduled, "callbacks"); 118 + } 119 + 120 + cleanup: 121 + task_work_stress__destroy(skel); 122 + } 123 + 124 + void test_task_work_stress(void) 125 + { 126 + if (test__start_subtest("no_delete")) 127 + task_work_run(false); 128 + if (test__start_subtest("with_delete")) 129 + task_work_run(true); 130 + }

+150

tools/testing/selftests/bpf/prog_tests/test_task_work.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 + #include <test_progs.h> 4 + #include <string.h> 5 + #include <stdio.h> 6 + #include "task_work.skel.h" 7 + #include "task_work_fail.skel.h" 8 + #include <linux/bpf.h> 9 + #include <linux/perf_event.h> 10 + #include <sys/syscall.h> 11 + #include <time.h> 12 + 13 + static int perf_event_open(__u32 type, __u64 config, int pid) 14 + { 15 + struct perf_event_attr attr = { 16 + .type = type, 17 + .config = config, 18 + .size = sizeof(struct perf_event_attr), 19 + .sample_period = 100000, 20 + }; 21 + 22 + return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0); 23 + } 24 + 25 + struct elem { 26 + char data[128]; 27 + struct bpf_task_work tw; 28 + }; 29 + 30 + static int verify_map(struct bpf_map *map, const char *expected_data) 31 + { 32 + int err; 33 + struct elem value; 34 + int processed_values = 0; 35 + int k, sz; 36 + 37 + sz = bpf_map__max_entries(map); 38 + for (k = 0; k < sz; ++k) { 39 + err = bpf_map__lookup_elem(map, &k, sizeof(int), &value, sizeof(struct elem), 0); 40 + if (err) 41 + continue; 42 + if (!ASSERT_EQ(strcmp(expected_data, value.data), 0, "map data")) { 43 + fprintf(stderr, "expected '%s', found '%s' in %s map", expected_data, 44 + value.data, bpf_map__name(map)); 45 + return 2; 46 + } 47 + processed_values++; 48 + } 49 + 50 + return processed_values == 0; 51 + } 52 + 53 + static void task_work_run(const char *prog_name, const char *map_name) 54 + { 55 + struct task_work *skel; 56 + struct bpf_program *prog; 57 + struct bpf_map *map; 58 + struct bpf_link *link; 59 + int err, pe_fd = 0, pid, status, pipefd[2]; 60 + char user_string[] = "hello world"; 61 + 62 + if (!ASSERT_NEQ(pipe(pipefd), -1, "pipe")) 63 + return; 64 + 65 + pid = fork(); 66 + if (pid == 0) { 67 + __u64 num = 1; 68 + int i; 69 + char buf; 70 + 71 + close(pipefd[1]); 72 + read(pipefd[0], &buf, sizeof(buf)); 73 + close(pipefd[0]); 74 + 75 + for (i = 0; i < 10000; ++i) 
76 + num *= time(0) % 7; 77 + (void)num; 78 + exit(0); 79 + } 80 + ASSERT_GT(pid, 0, "fork() failed"); 81 + 82 + skel = task_work__open(); 83 + if (!ASSERT_OK_PTR(skel, "task_work__open")) 84 + return; 85 + 86 + bpf_object__for_each_program(prog, skel->obj) { 87 + bpf_program__set_autoload(prog, false); 88 + } 89 + 90 + prog = bpf_object__find_program_by_name(skel->obj, prog_name); 91 + if (!ASSERT_OK_PTR(prog, "prog_name")) 92 + goto cleanup; 93 + bpf_program__set_autoload(prog, true); 94 + skel->bss->user_ptr = (char *)user_string; 95 + 96 + err = task_work__load(skel); 97 + if (!ASSERT_OK(err, "skel_load")) 98 + goto cleanup; 99 + 100 + pe_fd = perf_event_open(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, pid); 101 + if (pe_fd == -1 && (errno == ENOENT || errno == EOPNOTSUPP)) { 102 + printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__); 103 + test__skip(); 104 + goto cleanup; 105 + } 106 + if (!ASSERT_NEQ(pe_fd, -1, "pe_fd")) { 107 + fprintf(stderr, "perf_event_open errno: %d, pid: %d\n", errno, pid); 108 + goto cleanup; 109 + } 110 + 111 + link = bpf_program__attach_perf_event(prog, pe_fd); 112 + if (!ASSERT_OK_PTR(link, "attach_perf_event")) 113 + goto cleanup; 114 + 115 + close(pipefd[0]); 116 + write(pipefd[1], user_string, 1); 117 + close(pipefd[1]); 118 + /* Wait to collect some samples */ 119 + waitpid(pid, &status, 0); 120 + pid = 0; 121 + map = bpf_object__find_map_by_name(skel->obj, map_name); 122 + if (!ASSERT_OK_PTR(map, "find map_name")) 123 + goto cleanup; 124 + if (!ASSERT_OK(verify_map(map, user_string), "verify map")) 125 + goto cleanup; 126 + cleanup: 127 + if (pe_fd >= 0) 128 + close(pe_fd); 129 + task_work__destroy(skel); 130 + if (pid) { 131 + close(pipefd[0]); 132 + write(pipefd[1], user_string, 1); 133 + close(pipefd[1]); 134 + waitpid(pid, &status, 0); 135 + } 136 + } 137 + 138 + void test_task_work(void) 139 + { 140 + if (test__start_subtest("test_task_work_hash_map")) 141 + task_work_run("oncpu_hash_map", "hmap"); 142 + 143 + if 
(test__start_subtest("test_task_work_array_map")) 144 + task_work_run("oncpu_array_map", "arrmap"); 145 + 146 + if (test__start_subtest("test_task_work_lru_map")) 147 + task_work_run("oncpu_lru_map", "lrumap"); 148 + 149 + RUN_TESTS(task_work_fail); 150 + }

+107

tools/testing/selftests/bpf/progs/task_work.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 + 4 + #include <vmlinux.h> 5 + #include <string.h> 6 + #include <stdbool.h> 7 + #include <bpf/bpf_helpers.h> 8 + #include <bpf/bpf_tracing.h> 9 + #include "bpf_misc.h" 10 + #include "errno.h" 11 + 12 + char _license[] SEC("license") = "GPL"; 13 + 14 + const void *user_ptr = NULL; 15 + 16 + struct elem { 17 + char data[128]; 18 + struct bpf_task_work tw; 19 + }; 20 + 21 + struct { 22 + __uint(type, BPF_MAP_TYPE_HASH); 23 + __uint(map_flags, BPF_F_NO_PREALLOC); 24 + __uint(max_entries, 1); 25 + __type(key, int); 26 + __type(value, struct elem); 27 + } hmap SEC(".maps"); 28 + 29 + struct { 30 + __uint(type, BPF_MAP_TYPE_ARRAY); 31 + __uint(max_entries, 1); 32 + __type(key, int); 33 + __type(value, struct elem); 34 + } arrmap SEC(".maps"); 35 + 36 + struct { 37 + __uint(type, BPF_MAP_TYPE_LRU_HASH); 38 + __uint(max_entries, 1); 39 + __type(key, int); 40 + __type(value, struct elem); 41 + } lrumap SEC(".maps"); 42 + 43 + static int process_work(struct bpf_map *map, void *key, void *value) 44 + { 45 + struct elem *work = value; 46 + 47 + bpf_copy_from_user_str(work->data, sizeof(work->data), (const void *)user_ptr, 0); 48 + return 0; 49 + } 50 + 51 + int key = 0; 52 + 53 + SEC("perf_event") 54 + int oncpu_hash_map(struct pt_regs *args) 55 + { 56 + struct elem empty_work = { .data = { 0 } }; 57 + struct elem *work; 58 + struct task_struct *task; 59 + int err; 60 + 61 + task = bpf_get_current_task_btf(); 62 + err = bpf_map_update_elem(&hmap, &key, &empty_work, BPF_NOEXIST); 63 + if (err) 64 + return 0; 65 + work = bpf_map_lookup_elem(&hmap, &key); 66 + if (!work) 67 + return 0; 68 + 69 + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); 70 + return 0; 71 + } 72 + 73 + SEC("perf_event") 74 + int oncpu_array_map(struct pt_regs *args) 75 + { 76 + struct elem *work; 77 + struct task_struct *task; 78 + 79 + task = bpf_get_current_task_btf(); 
80 + work = bpf_map_lookup_elem(&arrmap, &key); 81 + if (!work) 82 + return 0; 83 + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work, NULL); 84 + return 0; 85 + } 86 + 87 + SEC("perf_event") 88 + int oncpu_lru_map(struct pt_regs *args) 89 + { 90 + struct elem empty_work = { .data = { 0 } }; 91 + struct elem *work; 92 + struct task_struct *task; 93 + int err; 94 + 95 + task = bpf_get_current_task_btf(); 96 + work = bpf_map_lookup_elem(&lrumap, &key); 97 + if (work) 98 + return 0; 99 + err = bpf_map_update_elem(&lrumap, &key, &empty_work, BPF_NOEXIST); 100 + if (err) 101 + return 0; 102 + work = bpf_map_lookup_elem(&lrumap, &key); 103 + if (!work || work->data[0]) 104 + return 0; 105 + bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work, NULL); 106 + return 0; 107 + }

+96

tools/testing/selftests/bpf/progs/task_work_fail.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 + 4 + #include <vmlinux.h> 5 + #include <string.h> 6 + #include <stdbool.h> 7 + #include <bpf/bpf_helpers.h> 8 + #include <bpf/bpf_tracing.h> 9 + #include "bpf_misc.h" 10 + 11 + char _license[] SEC("license") = "GPL"; 12 + 13 + const void *user_ptr = NULL; 14 + 15 + struct elem { 16 + char data[128]; 17 + struct bpf_task_work tw; 18 + }; 19 + 20 + struct { 21 + __uint(type, BPF_MAP_TYPE_HASH); 22 + __uint(map_flags, BPF_F_NO_PREALLOC); 23 + __uint(max_entries, 1); 24 + __type(key, int); 25 + __type(value, struct elem); 26 + } hmap SEC(".maps"); 27 + 28 + struct { 29 + __uint(type, BPF_MAP_TYPE_ARRAY); 30 + __uint(max_entries, 1); 31 + __type(key, int); 32 + __type(value, struct elem); 33 + } arrmap SEC(".maps"); 34 + 35 + static int process_work(struct bpf_map *map, void *key, void *value) 36 + { 37 + struct elem *work = value; 38 + 39 + bpf_copy_from_user_str(work->data, sizeof(work->data), (const void *)user_ptr, 0); 40 + return 0; 41 + } 42 + 43 + int key = 0; 44 + 45 + SEC("perf_event") 46 + __failure __msg("doesn't match map pointer in R3") 47 + int mismatch_map(struct pt_regs *args) 48 + { 49 + struct elem *work; 50 + struct task_struct *task; 51 + 52 + task = bpf_get_current_task_btf(); 53 + work = bpf_map_lookup_elem(&arrmap, &key); 54 + if (!work) 55 + return 0; 56 + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); 57 + return 0; 58 + } 59 + 60 + SEC("perf_event") 61 + __failure __msg("arg#1 doesn't point to a map value") 62 + int no_map_task_work(struct pt_regs *args) 63 + { 64 + struct task_struct *task; 65 + struct bpf_task_work tw; 66 + 67 + task = bpf_get_current_task_btf(); 68 + bpf_task_work_schedule_resume(task, &tw, &hmap, process_work, NULL); 69 + return 0; 70 + } 71 + 72 + SEC("perf_event") 73 + __failure __msg("Possibly NULL pointer passed to trusted arg1") 74 + int task_work_null(struct pt_regs *args) 75 + { 
76 + struct task_struct *task; 77 + 78 + task = bpf_get_current_task_btf(); 79 + bpf_task_work_schedule_resume(task, NULL, &hmap, process_work, NULL); 80 + return 0; 81 + } 82 + 83 + SEC("perf_event") 84 + __failure __msg("Possibly NULL pointer passed to trusted arg2") 85 + int map_null(struct pt_regs *args) 86 + { 87 + struct elem *work; 88 + struct task_struct *task; 89 + 90 + task = bpf_get_current_task_btf(); 91 + work = bpf_map_lookup_elem(&arrmap, &key); 92 + if (!work) 93 + return 0; 94 + bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work, NULL); 95 + return 0; 96 + }

+73

tools/testing/selftests/bpf/progs/task_work_stress.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ 3 + 4 + #include <vmlinux.h> 5 + #include <string.h> 6 + #include <stdbool.h> 7 + #include <bpf/bpf_helpers.h> 8 + #include <bpf/bpf_tracing.h> 9 + #include "bpf_misc.h" 10 + 11 + #define ENTRIES 128 12 + 13 + char _license[] SEC("license") = "GPL"; 14 + 15 + __u64 callback_scheduled = 0; 16 + __u64 callback_success = 0; 17 + __u64 schedule_error = 0; 18 + __u64 delete_success = 0; 19 + 20 + struct elem { 21 + __u32 count; 22 + struct bpf_task_work tw; 23 + }; 24 + 25 + struct { 26 + __uint(type, BPF_MAP_TYPE_HASH); 27 + __uint(map_flags, BPF_F_NO_PREALLOC); 28 + __uint(max_entries, ENTRIES); 29 + __type(key, int); 30 + __type(value, struct elem); 31 + } hmap SEC(".maps"); 32 + 33 + static int process_work(struct bpf_map *map, void *key, void *value) 34 + { 35 + __sync_fetch_and_add(&callback_success, 1); 36 + return 0; 37 + } 38 + 39 + SEC("syscall") 40 + int schedule_task_work(void *ctx) 41 + { 42 + struct elem empty_work = {.count = 0}; 43 + struct elem *work; 44 + int key = 0, err; 45 + 46 + key = bpf_ktime_get_ns() % ENTRIES; 47 + work = bpf_map_lookup_elem(&hmap, &key); 48 + if (!work) { 49 + bpf_map_update_elem(&hmap, &key, &empty_work, BPF_NOEXIST); 50 + work = bpf_map_lookup_elem(&hmap, &key); 51 + if (!work) 52 + return 0; 53 + } 54 + err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, 55 + process_work, NULL); 56 + if (err) 57 + __sync_fetch_and_add(&schedule_error, 1); 58 + else 59 + __sync_fetch_and_add(&callback_scheduled, 1); 60 + return 0; 61 + } 62 + 63 + SEC("syscall") 64 + int delete_task_work(void *ctx) 65 + { 66 + int key = 0, err; 67 + 68 + key = bpf_get_prandom_u32() % ENTRIES; 69 + err = bpf_map_delete_elem(&hmap, &key); 70 + if (!err) 71 + __sync_fetch_and_add(&delete_success, 1); 72 + return 0; 73 + }

Configure Feed

Configure Feed