Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage

Replace bpf memory allocator with kmalloc_nolock() to reduce memory
wastage due to preallocation.
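Condensed from the diff below, the selem allocation path now calls kmalloc_nolock(), which allocates on demand and is callable from any context, so the per-map bpf_mem_alloc caches (and the memory they preallocate) are no longer needed:

        if (smap->use_kmalloc_nolock) {
                /* On-demand, zeroed allocation; no per-cpu preallocated caches */
                selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
                                               __GFP_ZERO, NUMA_NO_NODE);
        } else {
                selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
                                        gfp_flags | __GFP_NOWARN);
        }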

In bpf_selem_free(), an selem now needs to wait for an RCU grace period
before being freed when reuse_now == true. Therefore, rcu_barrier()
should always be called in bpf_local_storage_map_free().
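The resulting teardown ordering, condensed from the diff below: rcu_barrier_tasks_trace() flushes the tasks-trace callbacks used for sleepable programs, while the now-unconditional rcu_barrier() flushes the plain call_rcu() callbacks queued by the reuse_now == true path, which rcu_barrier_tasks_trace() does not wait for even when rcu_trace_implies_rcu_gp() holds:

        if (smap->use_kmalloc_nolock) {
                rcu_barrier_tasks_trace();      /* flush call_rcu_tasks_trace() callbacks */
                rcu_barrier();                  /* flush call_rcu() callbacks, now unconditional */
        }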

In bpf_local_storage_free(), smap->storage_ma is no longer needed to
return the memory, so the function is now independent of smap.
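As the diff below shows, the smap parameter drops out of the signature:

        /* before */
        static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
                                           struct bpf_local_storage_map *smap,
                                           bool reuse_now);
        /* after: kfree_nolock()/call_rcu() need only the pointer */
        static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
                                           bool reuse_now);

This also removes the storage_smap lookups in bpf_selem_unlink_storage() and bpf_local_storage_destroy().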

Remove the outdated comment in bpf_local_storage_alloc(). Since commit
c0d63f309186 ("bpf: Add bpf_selem_free()"), the selem is already freed
after an RCU grace period in bpf_local_storage_update() when
bpf_local_storage_alloc() fails the cmpxchg.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20251114201329.3275875-5-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Authored by Amery Hung, committed by Alexei Starovoitov
f484f4a3 39a460c4

2 files changed: +53 -117

include/linux/bpf_local_storage.h (+3 -5)
···
         u32 bucket_log;
         u16 elem_size;
         u16 cache_idx;
-        struct bpf_mem_alloc selem_ma;
-        struct bpf_mem_alloc storage_ma;
-        bool bpf_ma;
+        bool use_kmalloc_nolock;
 };
 
 struct bpf_local_storage_data {
···
          */
         struct rcu_head rcu;
         raw_spinlock_t lock;    /* Protect adding/removing from the "list" */
-        bool bpf_ma;
+        bool use_kmalloc_nolock;
 };
 
 /* U16_MAX is much more than enough for sk local storage
···
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
                             struct bpf_local_storage_cache *cache,
-                            bool bpf_ma);
+                            bool use_kmalloc_nolock);
 
 void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
                                       struct bpf_local_storage_map *smap,
kernel/bpf/bpf_local_storage.c (+50 -112)
···
         if (mem_charge(smap, owner, smap->elem_size))
                 return NULL;
 
-        if (smap->bpf_ma) {
-                selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
-                if (selem)
-                        /* Keep the original bpf_map_kzalloc behavior
-                         * before started using the bpf_mem_cache_alloc.
-                         *
-                         * No need to use zero_map_value. The bpf_selem_free()
-                         * only does bpf_mem_cache_free when there is
-                         * no other bpf prog is using the selem.
-                         */
-                        memset(SDATA(selem)->data, 0, smap->map.value_size);
+        if (smap->use_kmalloc_nolock) {
+                selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
+                                               __GFP_ZERO, NUMA_NO_NODE);
         } else {
                 selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
                                         gfp_flags | __GFP_NOWARN);
···
         return NULL;
 }
 
-/* rcu tasks trace callback for bpf_ma == false */
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
 static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
 {
         struct bpf_local_storage *local_storage;
···
         kfree_rcu(local_storage, rcu);
 }
 
-static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
-{
-        struct bpf_local_storage *local_storage;
-
-        local_storage = container_of(rcu, struct bpf_local_storage, rcu);
-        bpf_mem_cache_raw_free(local_storage);
-}
-
-static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
-{
-        if (rcu_trace_implies_rcu_gp())
-                bpf_local_storage_free_rcu(rcu);
-        else
-                call_rcu(rcu, bpf_local_storage_free_rcu);
-}
-
-/* Handle bpf_ma == false */
+/* Handle use_kmalloc_nolock == false */
 static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
                                      bool vanilla_rcu)
 {
···
                                  __bpf_local_storage_free_trace_rcu);
 }
 
+static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+        struct bpf_local_storage *local_storage;
+
+        local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+        kfree_nolock(local_storage);
+}
+
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+        if (rcu_trace_implies_rcu_gp())
+                bpf_local_storage_free_rcu(rcu);
+        else
+                call_rcu(rcu, bpf_local_storage_free_rcu);
+}
+
 static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
-                                   struct bpf_local_storage_map *smap,
                                    bool reuse_now)
 {
         if (!local_storage)
                 return;
 
-        if (!local_storage->bpf_ma) {
+        if (!local_storage->use_kmalloc_nolock) {
                 __bpf_local_storage_free(local_storage, reuse_now);
                 return;
         }
 
-        if (!reuse_now) {
-                call_rcu_tasks_trace(&local_storage->rcu,
-                                     bpf_local_storage_free_trace_rcu);
+        if (reuse_now) {
+                call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
                 return;
         }
 
-        if (smap)
-                bpf_mem_cache_free(&smap->storage_ma, local_storage);
-        else
-                /* smap could be NULL if the selem that triggered
-                 * this 'local_storage' creation had been long gone.
-                 * In this case, directly do call_rcu().
-                 */
-                call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+        call_rcu_tasks_trace(&local_storage->rcu,
+                             bpf_local_storage_free_trace_rcu);
 }
 
-/* rcu tasks trace callback for bpf_ma == false */
+/* rcu tasks trace callback for use_kmalloc_nolock == false */
 static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
 {
         struct bpf_local_storage_elem *selem;
···
         kfree_rcu(selem, rcu);
 }
 
-/* Handle bpf_ma == false */
+/* Handle use_kmalloc_nolock == false */
 static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
                              bool vanilla_rcu)
···
         migrate_disable();
         bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
         migrate_enable();
-        bpf_mem_cache_raw_free(selem);
+        kfree_nolock(selem);
 }
 
 static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
···
 
         smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
 
-        if (!smap->bpf_ma) {
-                /* Only task storage has uptrs and task storage
-                 * has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true
-                 * for task storage, so this bpf_obj_free_fields() won't unpin
-                 * any uptr.
+        if (!smap->use_kmalloc_nolock) {
+                /*
+                 * No uptr will be unpin even when reuse_now == false since uptr
+                 * is only supported in task local storage, where
+                 * smap->use_kmalloc_nolock == true.
                  */
                 bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
                 __bpf_selem_free(selem, reuse_now);
···
         }
 
         if (reuse_now) {
-                /* reuse_now == true only happens when the storage owner
-                 * (e.g. task_struct) is being destructed or the map itself
-                 * is being destructed (ie map_free). In both cases,
-                 * no bpf prog can have a hold on the selem. It is
-                 * safe to unpin the uptrs and free the selem now.
+                /*
+                 * While it is okay to call bpf_obj_free_fields() that unpins uptr when
+                 * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
                  */
-                bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-                /* Instead of using the vanilla call_rcu(),
-                 * bpf_mem_cache_free will be able to reuse selem
-                 * immediately.
-                 */
-                bpf_mem_cache_free(&smap->selem_ma, selem);
+                call_rcu(&selem->rcu, bpf_selem_free_rcu);
                 return;
         }
···
 static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
                                      bool reuse_now)
 {
-        struct bpf_local_storage_map *storage_smap;
         struct bpf_local_storage *local_storage;
         bool free_local_storage = false;
         HLIST_HEAD(selem_free_list);
···
 
         local_storage = rcu_dereference_check(selem->local_storage,
                                               bpf_rcu_lock_held());
-        storage_smap = rcu_dereference_check(local_storage->smap,
-                                             bpf_rcu_lock_held());
 
         raw_spin_lock_irqsave(&local_storage->lock, flags);
         if (likely(selem_linked_to_storage(selem)))
···
         bpf_selem_free_list(&selem_free_list, reuse_now);
 
         if (free_local_storage)
-                bpf_local_storage_free(local_storage, storage_smap, reuse_now);
+                bpf_local_storage_free(local_storage, reuse_now);
 }
 
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
···
         if (err)
                 return err;
 
-        if (smap->bpf_ma)
-                storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+        if (smap->use_kmalloc_nolock)
+                storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
+                                                 __GFP_ZERO, NUMA_NO_NODE);
         else
                 storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
                                           gfp_flags | __GFP_NOWARN);
···
         INIT_HLIST_HEAD(&storage->list);
         raw_spin_lock_init(&storage->lock);
         storage->owner = owner;
-        storage->bpf_ma = smap->bpf_ma;
+        storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
 
         bpf_selem_link_storage_nolock(storage, first_selem);
         bpf_selem_link_map(smap, first_selem);
···
                 bpf_selem_unlink_map(first_selem);
                 err = -EAGAIN;
                 goto uncharge;
-
-                /* Note that even first_selem was linked to smap's
-                 * bucket->list, first_selem can be freed immediately
-                 * (instead of kfree_rcu) because
-                 * bpf_local_storage_map_free() does a
-                 * synchronize_rcu_mult (waiting for both sleepable and
-                 * normal programs) before walking the bucket->list.
-                 * Hence, no one is accessing selem from the
-                 * bucket->list under rcu_read_lock().
-                 */
         }
 
         return 0;
 
 uncharge:
-        bpf_local_storage_free(storage, smap, true);
+        bpf_local_storage_free(storage, true);
         mem_uncharge(smap, owner, sizeof(*storage));
         return err;
 }
···
 
 void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 {
-        struct bpf_local_storage_map *storage_smap;
         struct bpf_local_storage_elem *selem;
         bool free_storage = false;
         HLIST_HEAD(free_selem_list);
         struct hlist_node *n;
         unsigned long flags;
-
-        storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
 
         /* Neither the bpf_prog nor the bpf_map's syscall
          * could be modifying the local_storage->list now.
···
         bpf_selem_free_list(&free_selem_list, true);
 
         if (free_storage)
-                bpf_local_storage_free(local_storage, storage_smap, true);
+                bpf_local_storage_free(local_storage, true);
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
···
         return usage;
 }
 
-/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
- * A deadlock free allocator is useful for storage that the bpf prog can easily
- * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
- * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
- * memory immediately. To be reuse-immediate safe, the owner destruction
- * code path needs to go through a rcu grace period before calling
- * bpf_local_storage_destroy().
- *
- * When bpf_ma == false, the kmalloc and kfree are used.
- */
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
                             struct bpf_local_storage_cache *cache,
-                            bool bpf_ma)
+                            bool use_kmalloc_nolock)
 {
         struct bpf_local_storage_map *smap;
         unsigned int i;
···
 
         /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
          * preemptible context. Thus, enforce all storages to use
-         * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled.
+         * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
          */
-        smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma;
-        if (smap->bpf_ma) {
-                err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
-                if (err)
-                        goto free_smap;
-
-                err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
-                if (err) {
-                        bpf_mem_alloc_destroy(&smap->selem_ma);
-                        goto free_smap;
-                }
-        }
+        smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
 
         smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
         return &smap->map;
···
          */
         synchronize_rcu();
 
-        if (smap->bpf_ma) {
+        if (smap->use_kmalloc_nolock) {
                 rcu_barrier_tasks_trace();
-                if (!rcu_trace_implies_rcu_gp())
-                        rcu_barrier();
-                bpf_mem_alloc_destroy(&smap->selem_ma);
-                bpf_mem_alloc_destroy(&smap->storage_ma);
+                rcu_barrier();
         }
         kvfree(smap->buckets);
         bpf_map_area_free(smap);
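
For reference, the *_free_trace_rcu() helpers kept by this patch use the usual two-flavor grace-period chaining; an annotated excerpt of bpf_local_storage_free_trace_rcu() from the diff above:

        static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
        {
                /* Runs after an RCU tasks trace grace period. If that grace
                 * period is known to imply a regular RCU grace period too,
                 * the object can be freed right away; otherwise chain a
                 * regular call_rcu() so both sleepable and non-sleepable
                 * readers are done before kfree_nolock().
                 */
                if (rcu_trace_implies_rcu_gp())
                        bpf_local_storage_free_rcu(rcu);
                else
                        call_rcu(rcu, bpf_local_storage_free_rcu);
        }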