Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

bpf: Support lockless unlink when freeing map or local storage

Introduce bpf_selem_unlink_nofail() to properly handle errors returned
from rqspinlock in bpf_local_storage_map_free() and
bpf_local_storage_destroy() where the operation must succeed.

The idea of bpf_selem_unlink_nofail() is to allow an selem to be
partially linked and use atomic operation on a bit field, selem->state,
to determine when and who can free the selem if any unlink under lock
fails. An selem initially is fully linked to a map and a local storage.
Under normal circumstances, bpf_selem_unlink_nofail() will be able to
grab locks and unlink a selem from map and local storage in sequence,
just like bpf_selem_unlink(), and then free it after an RCU grace period.
However, if any of the lock attempts fails, it will only clear
SDATA(selem)->smap or selem->local_storage depending on the caller and
set SELEM_MAP_UNLINKED or SELEM_STORAGE_UNLINKED according to the
caller. Then, after both map_free() and destroy() see the selem and the
state becomes SELEM_UNLINKED, only one of the two racing callers can
succeed in the cmpxchg of the state from SELEM_UNLINKED to SELEM_TOFREE,
ensuring no double free or memory leak.

To make sure bpf_obj_free_fields() is done only once and when map is
still present, it is called when unlinking an selem from b->list under
b->lock.

To make sure uncharging memory is done only when the owner is still
present in map_free(), block destroy() from returning until there is no
pending map_free().

Since smap may not be valid in destroy(), bpf_selem_unlink_nofail()
skips bpf_selem_unlink_storage_nolock_misc() when called from destroy().
This is okay as bpf_local_storage_destroy() will return the remaining
amount of memory charge tracked by mem_charge to the owner to uncharge.
It is also safe to skip clearing local_storage->owner and owner_storage
as the owner is being freed and no users or bpf programs should be able
to reference the owner or use local_storage.

Finally, accesses to selem, SDATA(selem)->smap and selem->local_storage
are racy. Callers will protect these fields with RCU.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Co-developed-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20260205222916.1788211-11-ameryhung@gmail.com

authored by

Amery Hung and committed by
Martin KaFai Lau
5d800f87 c8be3da1

+118 -7
+8 -1
include/linux/bpf_local_storage.h
··· 68 68 u8 data[] __aligned(8); 69 69 }; 70 70 71 + #define SELEM_MAP_UNLINKED (1 << 0) 72 + #define SELEM_STORAGE_UNLINKED (1 << 1) 73 + #define SELEM_UNLINKED (SELEM_MAP_UNLINKED | SELEM_STORAGE_UNLINKED) 74 + #define SELEM_TOFREE (1 << 2) 75 + 71 76 /* Linked to bpf_local_storage and bpf_local_storage_map */ 72 77 struct bpf_local_storage_elem { 73 78 struct hlist_node map_node; /* Linked to bpf_local_storage_map */ ··· 85 80 * after raw_spin_unlock 86 81 */ 87 82 }; 83 + atomic_t state; 88 84 bool use_kmalloc_nolock; 89 - /* 7 bytes hole */ 85 + /* 3 bytes hole */ 90 86 /* The data is stored in another cacheline to minimize 91 87 * the number of cachelines access during a cache hit. 92 88 */ ··· 103 97 struct rcu_head rcu; 104 98 rqspinlock_t lock; /* Protect adding/removing from the "list" */ 105 99 u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */ 100 + refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */ 106 101 bool use_kmalloc_nolock; 107 102 }; 108 103
+110 -6
kernel/bpf/bpf_local_storage.c
··· 85 85 86 86 if (selem) { 87 87 RCU_INIT_POINTER(SDATA(selem)->smap, smap); 88 + atomic_set(&selem->state, 0); 88 89 selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; 89 90 90 91 if (value) { ··· 195 194 /* The bpf_local_storage_map_free will wait for rcu_barrier */ 196 195 smap = rcu_dereference_check(SDATA(selem)->smap, 1); 197 196 198 - migrate_disable(); 199 - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); 200 - migrate_enable(); 197 + if (smap) { 198 + migrate_disable(); 199 + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); 200 + migrate_enable(); 201 + } 201 202 kfree_nolock(selem); 202 203 } 203 204 ··· 224 221 * is only supported in task local storage, where 225 222 * smap->use_kmalloc_nolock == true. 226 223 */ 227 - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); 224 + if (smap) 225 + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); 228 226 __bpf_selem_free(selem, reuse_now); 229 227 return; 230 228 } ··· 259 255 static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, 260 256 struct bpf_local_storage_map *smap, 261 257 struct bpf_local_storage *local_storage, 262 - bool free_local_storage) 258 + bool free_local_storage, bool pin_owner) 263 259 { 264 260 void *owner = local_storage->owner; 265 261 u32 uncharge = smap->elem_size; ··· 267 263 if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == 268 264 SDATA(selem)) 269 265 RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); 266 + 267 + if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt)) 268 + return; 270 269 271 270 uncharge += free_local_storage ? 
sizeof(*local_storage) : 0; 272 271 mem_uncharge(smap, local_storage->owner, uncharge); ··· 281 274 /* After this RCU_INIT, owner may be freed and cannot be used */ 282 275 RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); 283 276 } 277 + 278 + if (pin_owner) 279 + refcount_dec(&local_storage->owner_refcnt); 284 280 } 285 281 286 282 /* local_storage->lock must be held and selem->local_storage == local_storage. ··· 303 293 &local_storage->list); 304 294 305 295 bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, 306 - free_local_storage); 296 + free_local_storage, false); 307 297 308 298 hlist_del_init_rcu(&selem->snode); 309 299 ··· 419 409 return err; 420 410 } 421 411 412 + /* 413 + * Unlink an selem from map and local storage with lockless fallback if callers 414 + * are racing or rqspinlock returns error. It should only be called by 415 + * bpf_local_storage_destroy() or bpf_local_storage_map_free(). 416 + */ 417 + static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem, 418 + struct bpf_local_storage_map_bucket *b) 419 + { 420 + bool in_map_free = !!b, free_storage = false; 421 + struct bpf_local_storage *local_storage; 422 + struct bpf_local_storage_map *smap; 423 + unsigned long flags; 424 + int err, unlink = 0; 425 + 426 + local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); 427 + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); 428 + 429 + if (smap) { 430 + b = b ? : select_bucket(smap, local_storage); 431 + err = raw_res_spin_lock_irqsave(&b->lock, flags); 432 + if (!err) { 433 + /* 434 + * Call bpf_obj_free_fields() under b->lock to make sure it is done 435 + * exactly once for an selem. Safe to free special fields immediately 436 + * as no BPF program should be referencing the selem. 
437 + */ 438 + if (likely(selem_linked_to_map(selem))) { 439 + hlist_del_init_rcu(&selem->map_node); 440 + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); 441 + unlink++; 442 + } 443 + raw_res_spin_unlock_irqrestore(&b->lock, flags); 444 + } 445 + /* 446 + * Highly unlikely scenario: resource leak 447 + * 448 + * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing 449 + * and both selem belong to the same bucket, if destroy(selem2) acquired 450 + * b->lock and block for too long, neither map_free(selem1) and 451 + * destroy(selem1) will be able to free the special field associated 452 + * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT. 453 + */ 454 + WARN_ON_ONCE(err && in_map_free); 455 + if (!err || in_map_free) 456 + RCU_INIT_POINTER(SDATA(selem)->smap, NULL); 457 + } 458 + 459 + if (local_storage) { 460 + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); 461 + if (!err) { 462 + if (likely(selem_linked_to_storage(selem))) { 463 + free_storage = hlist_is_singular_node(&selem->snode, 464 + &local_storage->list); 465 + /* 466 + * Okay to skip clearing owner_storage and storage->owner in 467 + * destroy() since the owner is going away. No user or bpf 468 + * programs should be able to reference it. 469 + */ 470 + if (smap && in_map_free) 471 + bpf_selem_unlink_storage_nolock_misc( 472 + selem, smap, local_storage, 473 + free_storage, true); 474 + hlist_del_init_rcu(&selem->snode); 475 + unlink++; 476 + } 477 + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); 478 + } 479 + if (!err || !in_map_free) 480 + RCU_INIT_POINTER(selem->local_storage, NULL); 481 + } 482 + 483 + if (unlink != 2) 484 + atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state); 485 + 486 + /* 487 + * Normally, an selem can be unlinked under local_storage->lock and b->lock, and 488 + * then freed after an RCU grace period. 
However, if destroy() and map_free() are 489 + * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free 490 + * the selem only after both map_free() and destroy() see the selem. 491 + */ 492 + if (unlink == 2 || 493 + atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED) 494 + bpf_selem_free(selem, true); 495 + 496 + if (free_storage) 497 + bpf_local_storage_free(local_storage, true); 498 + } 499 + 422 500 void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, 423 501 struct bpf_local_storage_map *smap, 424 502 struct bpf_local_storage_elem *selem) ··· 573 475 storage->owner = owner; 574 476 storage->mem_charge = sizeof(*storage); 575 477 storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; 478 + refcount_set(&storage->owner_refcnt, 1); 576 479 577 480 bpf_selem_link_storage_nolock(storage, first_selem); 578 481 ··· 842 743 843 744 if (free_storage) 844 745 bpf_local_storage_free(local_storage, true); 746 + 747 + if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { 748 + while (refcount_read(&local_storage->owner_refcnt)) 749 + cpu_relax(); 750 + } 845 751 } 846 752 847 753 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)