Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

bpf: Register dtor for freeing special fields

There is a race window where BPF hash map elements can leak special
fields if a program with access to the map value recreates these special
fields between the check_and_free_fields() call on the map value and the
element's eventual return to the memory allocator.
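
To make the window concrete, here is a rough illustration (not part of
this patch) of the program side, assuming a non-preallocated hash map
whose value embeds a bpf_timer; the map, section, and program names are
hypothetical:

  /* Prog A deletes the element with bpf_map_delete_elem(), which runs
   * check_and_free_fields() on the value. Prog B, still holding an
   * RCU-protected pointer from an earlier lookup, re-creates the timer
   * field before the element reaches the allocator; that field used to
   * leak.
   */
  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>

  char LICENSE[] SEC("license") = "GPL";

  struct elem_val {
          struct bpf_timer t;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_HASH);
          __uint(map_flags, BPF_F_NO_PREALLOC);
          __uint(max_entries, 1);
          __type(key, u32);
          __type(value, struct elem_val);
  } timers SEC(".maps");

  SEC("tc")
  int prog_b(struct __sk_buff *skb)
  {
          u32 key = 0;
          struct elem_val *v;

          v = bpf_map_lookup_elem(&timers, &key);
          if (!v)
                  return 0;
          /* Races with a concurrent delete of the same key in prog A. */
          bpf_timer_init(&v->t, &timers, 1 /* CLOCK_MONOTONIC */);
          return 0;
  }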

Several approaches were explored prior to this patch; most notably, [0]
tried to use a poison value to reject attempts to recreate special fields
for map values that have been logically deleted but are still accessible
to BPF programs (either while sitting in the free list or when reused).
While this approach works well for task work, timers, wq, etc., it is
harder to apply the idea to kptrs, which have a similar race and failure
mode.

Instead, we change bpf_mem_alloc to allow registering a destructor for
allocated elements, so that when they are returned to the allocator, any
special fields created while they were still accessible to programs are
freed. If these values get reused, we do not free the fields again before
handing the element back out. The special fields may thus remain
initialized while the map value sits in a free list.
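
The hook added to bpf_mem_alloc has the following shape (see the
include/linux/bpf_mem_alloc.h hunk below); the caller sketch is
illustrative only, with hypothetical my_* names:

  static void my_elem_dtor(void *obj, void *ctx)
  {
          /* obj is the element payload about to be handed back to the
           * underlying allocator (see free_all() in memalloc.c), ctx is
           * whatever was registered alongside the dtor.
           */
  }

  static void my_ctx_free(void *ctx)
  {
          kfree(ctx);     /* invoked once all free callbacks are done */
  }

  static void my_map_setup(struct bpf_mem_alloc *ma, void *ctx)
  {
          bpf_mem_alloc_set_dtor(ma, my_elem_dtor, my_ctx_free, ctx);
  }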

When bpf_mem_alloc is retired in the future, a similar concept can be
introduced to kmalloc_nolock-backed kmem_cache, paired with the existing
idea of a constructor.

Note that the destructor registration happens in map_check_btf, after
the BTF record is populated and (at that point) available for inspection
and duplication. Duplication is necessary since the freeing of the
embedded bpf_mem_alloc can be decoupled from the actual map lifetime, due
to the logic introduced to reduce the cost of rcu_barrier()s in the mem
alloc free path in 9f2c6e96c65e ("bpf: Optimize rcu_barrier usage between
hash map and bpf_mem_alloc.").

As such, once all callbacks are done, we must also free the duplicated
record. To remove the dependency on the bpf_map itself, we also stash the
map's key size so that the value can be obtained from the htab_elem long
after the map is gone.
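
Concretely, the duplicated record and the stashed key size live in the
dtor context, which is all the dtor needs to locate the value and free
its special fields; this is abridged from the kernel/bpf/hashtab.c hunk
below:

  struct htab_btf_record {
          struct btf_record *record;      /* duplicated via btf_record_dup() */
          u32 key_size;                   /* enough to find the value in htab_elem */
  };

  static void htab_mem_dtor(void *obj, void *ctx)
  {
          struct htab_btf_record *hrec = ctx;
          struct htab_elem *elem = obj;

          if (IS_ERR_OR_NULL(hrec->record))
                  return;
          bpf_obj_free_fields(hrec->record, htab_elem_value(elem, hrec->key_size));
  }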

[0]: https://lore.kernel.org/bpf/20260216131341.1285427-1-mykyta.yatsenko5@gmail.com

Fixes: 14a324f6a67e ("bpf: Wire up freeing of referenced kptr")
Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20260227224806.646888-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Authored by Kumar Kartikeya Dwivedi and committed by Alexei Starovoitov
1df97a74 6881af27

+140 -11

include/linux/bpf_mem_alloc.h (+6)
···
         struct obj_cgroup *objcg;
         bool percpu;
         struct work_struct work;
+        void (*dtor_ctx_free)(void *ctx);
+        void *dtor_ctx;
 };

 /* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
···
 /* The percpu allocation with a specific unit size. */
 int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
+void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma,
+                            void (*dtor)(void *obj, void *ctx),
+                            void (*dtor_ctx_free)(void *ctx),
+                            void *ctx);

 /* Check the allocation size for kmalloc equivalent allocator */
 int bpf_mem_alloc_check_size(bool percpu, size_t size);
kernel/bpf/hashtab.c (+87)
···
         char key[] __aligned(8);
 };

+struct htab_btf_record {
+        struct btf_record *record;
+        u32 key_size;
+};
+
 static inline bool htab_is_prealloc(const struct bpf_htab *htab)
 {
         return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
···
                 return -E2BIG;

         return 0;
+}
+
+static void htab_mem_dtor(void *obj, void *ctx)
+{
+        struct htab_btf_record *hrec = ctx;
+        struct htab_elem *elem = obj;
+        void *map_value;
+
+        if (IS_ERR_OR_NULL(hrec->record))
+                return;
+
+        map_value = htab_elem_value(elem, hrec->key_size);
+        bpf_obj_free_fields(hrec->record, map_value);
+}
+
+static void htab_pcpu_mem_dtor(void *obj, void *ctx)
+{
+        void __percpu *pptr = *(void __percpu **)obj;
+        struct htab_btf_record *hrec = ctx;
+        int cpu;
+
+        if (IS_ERR_OR_NULL(hrec->record))
+                return;
+
+        for_each_possible_cpu(cpu)
+                bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu));
+}
+
+static void htab_dtor_ctx_free(void *ctx)
+{
+        struct htab_btf_record *hrec = ctx;
+
+        btf_record_free(hrec->record);
+        kfree(ctx);
+}
+
+static int htab_set_dtor(const struct bpf_htab *htab, void (*dtor)(void *, void *))
+{
+        u32 key_size = htab->map.key_size;
+        const struct bpf_mem_alloc *ma;
+        struct htab_btf_record *hrec;
+        int err;
+
+        /* No need for dtors. */
+        if (IS_ERR_OR_NULL(htab->map.record))
+                return 0;
+
+        hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
+        if (!hrec)
+                return -ENOMEM;
+        hrec->key_size = key_size;
+        hrec->record = btf_record_dup(htab->map.record);
+        if (IS_ERR(hrec->record)) {
+                err = PTR_ERR(hrec->record);
+                kfree(hrec);
+                return err;
+        }
+        ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma;
+        /* Kinda sad, but cast away const-ness since we change ma->dtor. */
+        bpf_mem_alloc_set_dtor((struct bpf_mem_alloc *)ma, dtor, htab_dtor_ctx_free, hrec);
+        return 0;
+}
+
+static int htab_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+                              const struct btf_type *key_type, const struct btf_type *value_type)
+{
+        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+        if (htab_is_prealloc(htab))
+                return 0;
+        /*
+         * We must set the dtor using this callback, as map's BTF record is not
+         * populated in htab_map_alloc(), so it will always appear as NULL.
+         */
+        if (htab_is_percpu(htab))
+                return htab_set_dtor(htab, htab_pcpu_mem_dtor);
+        else
+                return htab_set_dtor(htab, htab_mem_dtor);
 }

 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
···
         .map_seq_show_elem = htab_map_seq_show_elem,
         .map_set_for_each_callback_args = map_set_for_each_callback_args,
         .map_for_each_callback = bpf_for_each_hash_elem,
+        .map_check_btf = htab_map_check_btf,
         .map_mem_usage = htab_map_mem_usage,
         BATCH_OPS(htab),
         .map_btf_id = &htab_map_btf_ids[0],
···
         .map_seq_show_elem = htab_map_seq_show_elem,
         .map_set_for_each_callback_args = map_set_for_each_callback_args,
         .map_for_each_callback = bpf_for_each_hash_elem,
+        .map_check_btf = htab_map_check_btf,
         .map_mem_usage = htab_map_mem_usage,
         BATCH_OPS(htab_lru),
         .map_btf_id = &htab_map_btf_ids[0],
···
         .map_seq_show_elem = htab_percpu_map_seq_show_elem,
         .map_set_for_each_callback_args = map_set_for_each_callback_args,
         .map_for_each_callback = bpf_for_each_hash_elem,
+        .map_check_btf = htab_map_check_btf,
         .map_mem_usage = htab_map_mem_usage,
         BATCH_OPS(htab_percpu),
         .map_btf_id = &htab_map_btf_ids[0],
···
         .map_seq_show_elem = htab_percpu_map_seq_show_elem,
         .map_set_for_each_callback_args = map_set_for_each_callback_args,
         .map_for_each_callback = bpf_for_each_hash_elem,
+        .map_check_btf = htab_map_check_btf,
         .map_mem_usage = htab_map_mem_usage,
         BATCH_OPS(htab_lru_percpu),
         .map_btf_id = &htab_map_btf_ids[0],
kernel/bpf/memalloc.c (+47 -11)
···
         int percpu_size;
         bool draining;
         struct bpf_mem_cache *tgt;
+        void (*dtor)(void *obj, void *ctx);
+        void *dtor_ctx;

         /* list of objects to be freed after RCU GP */
         struct llist_head free_by_rcu;
···
         kfree(obj);
 }

-static int free_all(struct llist_node *llnode, bool percpu)
+static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu)
 {
         struct llist_node *pos, *t;
         int cnt = 0;

         llist_for_each_safe(pos, t, llnode) {
+                if (c->dtor)
+                        c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx);
                 free_one(pos, percpu);
                 cnt++;
         }
···
 {
         struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);

-        free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
+        free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
         atomic_set(&c->call_rcu_ttrace_in_progress, 0);
 }

···
         if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
                 if (unlikely(READ_ONCE(c->draining))) {
                         llnode = llist_del_all(&c->free_by_rcu_ttrace);
-                        free_all(llnode, !!c->percpu_size);
+                        free_all(c, llnode, !!c->percpu_size);
                 }
                 return;
         }
···
         dec_active(c, &flags);

         if (unlikely(READ_ONCE(c->draining))) {
-                free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
+                free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
                 atomic_set(&c->call_rcu_in_progress, 0);
         } else {
                 call_rcu_hurry(&c->rcu, __free_by_rcu);
···
          * Except for waiting_for_gp_ttrace list, there are no concurrent operations
          * on these lists, so it is safe to use __llist_del_all().
          */
-        free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
-        free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
-        free_all(__llist_del_all(&c->free_llist), percpu);
-        free_all(__llist_del_all(&c->free_llist_extra), percpu);
-        free_all(__llist_del_all(&c->free_by_rcu), percpu);
-        free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
-        free_all(llist_del_all(&c->waiting_for_gp), percpu);
+        free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu);
+        free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu);
+        free_all(c, __llist_del_all(&c->free_llist), percpu);
+        free_all(c, __llist_del_all(&c->free_llist_extra), percpu);
+        free_all(c, __llist_del_all(&c->free_by_rcu), percpu);
+        free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu);
+        free_all(c, llist_del_all(&c->waiting_for_gp), percpu);
 }

 static void check_mem_cache(struct bpf_mem_cache *c)
···

 static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
 {
+        /* We can free dtor ctx only once all callbacks are done using it. */
+        if (ma->dtor_ctx_free)
+                ma->dtor_ctx_free(ma->dtor_ctx);
         check_leaked_objs(ma);
         free_percpu(ma->cache);
         free_percpu(ma->caches);
···
                 return -E2BIG;

         return 0;
+}
+
+void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx),
+                            void (*dtor_ctx_free)(void *ctx), void *ctx)
+{
+        struct bpf_mem_caches *cc;
+        struct bpf_mem_cache *c;
+        int cpu, i;
+
+        ma->dtor_ctx_free = dtor_ctx_free;
+        ma->dtor_ctx = ctx;
+
+        if (ma->cache) {
+                for_each_possible_cpu(cpu) {
+                        c = per_cpu_ptr(ma->cache, cpu);
+                        c->dtor = dtor;
+                        c->dtor_ctx = ctx;
+                }
+        }
+        if (ma->caches) {
+                for_each_possible_cpu(cpu) {
+                        cc = per_cpu_ptr(ma->caches, cpu);
+                        for (i = 0; i < NUM_CACHES; i++) {
+                                c = &cc->cache[i];
+                                c->dtor = dtor;
+                                c->dtor_ctx = ctx;
+                        }
+                }
+        }
 }