Merge branch 'bpf-migrate-bpf_task_work-and-file-dynptr-to-kmalloc_nolock'

Mykyta Yatsenko says:

====================
bpf: Migrate bpf_task_work and file dynptr to kmalloc_nolock

Now that kmalloc can be used from NMI context via kmalloc_nolock(),
migrate BPF internal allocations away from bpf_mem_alloc to use the
standard slab allocator.

Use kfree_rcu() for deferred freeing, which waits for a regular RCU
grace period before the memory is reclaimed. Sleepable BPF programs
hold rcu_read_lock_trace but not regular rcu_read_lock, so patch 1
adds explicit rcu_read_lock/unlock around the pointer-to-refcount
window to prevent kfree_rcu from freeing memory while a sleepable
program is still between reading the pointer and acquiring a
reference.

Patch 1 migrates bpf_task_work_ctx from bpf_mem_alloc/bpf_mem_free to
kmalloc_nolock/kfree_rcu.

Patch 2 migrates bpf_dynptr_file_impl from bpf_mem_alloc/bpf_mem_free
to kmalloc_nolock/kfree_nolock.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
---
Changes in v2:
- Switch to scoped_guard in patch 1 (Kumar)
- Remove RCU grace-period wait in patch 2 (Kumar)
- Defer freeing to irq_work when IRQs are disabled in patch 1
- Use bpf_map_kmalloc_nolock() for bpf_task_work in patch 1
- Use kmalloc_nolock() for file dynptr in patch 2
- Link to v1: https://lore.kernel.org/all/20260325-kmalloc_special-v1-0-269666afb1ea@meta.com/
====================

Link: https://patch.msgid.link/20260330-kmalloc_special-v2-0-c90403f92ff0@meta.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Alexei Starovoitov 2 months ago e25cfbec f7601044

+38 -20

1 changed file

expand all

kernel

bpf

helpers.c

+38 -20

kernel/bpf/helpers.c

··· 4295 4295 return refcount_inc_not_zero(&ctx->refcnt); 4296 4296 } 4297 4297 4298 + static void bpf_task_work_destroy(struct irq_work *irq_work) 4299 + { 4300 + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); 4301 + 4302 + bpf_task_work_ctx_reset(ctx); 4303 + kfree_rcu(ctx, rcu); 4304 + } 4305 + 4298 4306 static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) 4299 4307 { 4300 4308 if (!refcount_dec_and_test(&ctx->refcnt)) 4301 4309 return; 4302 4310 4303 - bpf_task_work_ctx_reset(ctx); 4304 - 4305 - /* bpf_mem_free expects migration to be disabled */ 4306 - migrate_disable(); 4307 - bpf_mem_free(&bpf_global_ma, ctx); 4308 - migrate_enable(); 4311 + if (irqs_disabled()) { 4312 + ctx->irq_work = IRQ_WORK_INIT(bpf_task_work_destroy); 4313 + irq_work_queue(&ctx->irq_work); 4314 + } else { 4315 + bpf_task_work_destroy(&ctx->irq_work); 4316 + } 4309 4317 } 4310 4318 4311 4319 static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) ··· 4367 4359 enum bpf_task_work_state state; 4368 4360 int err; 4369 4361 4370 - guard(rcu_tasks_trace)(); 4362 + guard(rcu)(); 4371 4363 4372 4364 if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { 4373 4365 bpf_task_work_ctx_put(ctx); ··· 4389 4381 /* 4390 4382 * It's technically possible for just scheduled task_work callback to 4391 4383 * complete running by now, going SCHEDULING -> RUNNING and then 4392 - * dropping its ctx refcount. Instead of capturing extra ref just to 4393 - * protected below ctx->state access, we rely on RCU protection to 4394 - * perform below SCHEDULING -> SCHEDULED attempt. 4384 + * dropping its ctx refcount. Instead of capturing an extra ref just 4385 + * to protect below ctx->state access, we rely on rcu_read_lock 4386 + * above to prevent kfree_rcu from freeing ctx before we return. 
4395 4387 */ 4396 4388 state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); 4397 4389 if (state == BPF_TW_FREED) ··· 4408 4400 if (ctx) 4409 4401 return ctx; 4410 4402 4411 - ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx)); 4403 + ctx = bpf_map_kmalloc_nolock(map, sizeof(*ctx), 0, NUMA_NO_NODE); 4412 4404 if (!ctx) 4413 4405 return ERR_PTR(-ENOMEM); 4414 4406 ··· 4422 4414 * tw->ctx is set by concurrent BPF program, release allocated 4423 4415 * memory and try to reuse already set context. 4424 4416 */ 4425 - bpf_mem_free(&bpf_global_ma, ctx); 4417 + kfree_nolock(ctx); 4426 4418 return old_ctx; 4427 4419 } 4428 4420 ··· 4434 4426 { 4435 4427 struct bpf_task_work_ctx *ctx; 4436 4428 4437 - ctx = bpf_task_work_fetch_ctx(tw, map); 4438 - if (IS_ERR(ctx)) 4439 - return ctx; 4429 + /* 4430 + * Sleepable BPF programs hold rcu_read_lock_trace but not 4431 + * regular rcu_read_lock. Since kfree_rcu waits for regular 4432 + * RCU GP, the ctx can be freed while we're between reading 4433 + * the pointer and incrementing the refcount. Take regular 4434 + * rcu_read_lock to prevent kfree_rcu from freeing the ctx 4435 + * before we can tryget it. 
4436 + */ 4437 + scoped_guard(rcu) { 4438 + ctx = bpf_task_work_fetch_ctx(tw, map); 4439 + if (IS_ERR(ctx)) 4440 + return ctx; 4440 4441 4441 - /* try to get ref for task_work callback to hold */ 4442 - if (!bpf_task_work_ctx_tryget(ctx)) 4443 - return ERR_PTR(-EBUSY); 4442 + /* try to get ref for task_work callback to hold */ 4443 + if (!bpf_task_work_ctx_tryget(ctx)) 4444 + return ERR_PTR(-EBUSY); 4445 + } 4444 4446 4445 4447 if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { 4446 4448 /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ ··· 4565 4547 return -EINVAL; 4566 4548 } 4567 4549 4568 - state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl)); 4550 + state = kmalloc_nolock(sizeof(*state), 0, NUMA_NO_NODE); 4569 4551 if (!state) { 4570 4552 bpf_dynptr_set_null(ptr); 4571 4553 return -ENOMEM; ··· 4597 4579 return 0; 4598 4580 4599 4581 freader_cleanup(&df->freader); 4600 - bpf_mem_free(&bpf_global_ma, df); 4582 + kfree_nolock(df); 4601 4583 bpf_dynptr_set_null(ptr); 4602 4584 return 0; 4603 4585 }

Configure Feed

Configure Feed