Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'bpf-reduce-the-use-of-migrate_-disable-enable'

Hou Tao says:

====================
The use of migrate_{disable|enable} pair in BPF is mainly due to the
introduction of bpf memory allocator and the use of per-CPU data struct
in its internal implementation. The caller needs to disable migration
before invoking the alloc or free APIs of bpf memory allocator, and
enable migration after the invocation.

The main users of bpf memory allocator are various kinds of bpf maps in
which the map values or the special fields in the map values are
allocated by using bpf memory allocator.

At present, the running context for bpf program has already disabled
migration explicitly or implicitly, therefore, when these maps are
manipulated in bpf program, it is OK to not invoke migrate_disable()
and migrate_enable() pair. However, it is not always the case when
these maps are manipulated through bpf syscall, therefore many
migrate_{disable|enable} pairs are added when the map can either be
manipulated by BPF program or BPF syscall.

The initial idea of reducing the use of migrate_{disable|enable} comes
from Alexei [1]. I turned it into a patch set that achieves the goals
through the following three methods:

1. remove unnecessary migrate_{disable|enable} pair
when the BPF syscall path also disables migration, it is OK to remove
the pair. Patch #1~#3 fall into this category, while patch #4~#5 are
partially included.

2. move the migrate_{disable|enable} pair from inner callee to outer
caller
Instead of invoking migrate_disable() in the inner callee, invoke
migrate_disable() in the outer caller to simplify reasoning about when
migrate_disable() is needed. Patch #4~#5 and patch #6~#19 belong to
this category.

3. add cant_migrate() check in the inner callee
Add cant_migrate() check in the inner callee to ensure the guarantee
that migration is disabled is not broken. Patch #1~#5, #13, #16~#19 also
belong to this category.

Please check the individual patches for more details. Comments are
always welcome.

Change Log:
v2:
* squash the ->map_free related patches (#10~#12, #15) into one patch
* remove unnecessary cant_migrate() checks.

v1: https://lore.kernel.org/bpf/20250106081900.1665573-1-houtao@huaweicloud.com
====================

Link: https://patch.msgid.link/20250108010728.207536-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

+55 -88
+2 -4
kernel/bpf/arraymap.c
··· 735 735 u64 ret = 0; 736 736 void *val; 737 737 738 + cant_migrate(); 739 + 738 740 if (flags != 0) 739 741 return -EINVAL; 740 742 741 743 is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; 742 744 array = container_of(map, struct bpf_array, map); 743 - if (is_percpu) 744 - migrate_disable(); 745 745 for (i = 0; i < map->max_entries; i++) { 746 746 if (is_percpu) 747 747 val = this_cpu_ptr(array->pptrs[i]); ··· 756 756 break; 757 757 } 758 758 759 - if (is_percpu) 760 - migrate_enable(); 761 759 return num_elems; 762 760 } 763 761
+7 -8
kernel/bpf/bpf_cgrp_storage.c
··· 15 15 16 16 static void bpf_cgrp_storage_lock(void) 17 17 { 18 - migrate_disable(); 18 + cant_migrate(); 19 19 this_cpu_inc(bpf_cgrp_storage_busy); 20 20 } 21 21 22 22 static void bpf_cgrp_storage_unlock(void) 23 23 { 24 24 this_cpu_dec(bpf_cgrp_storage_busy); 25 - migrate_enable(); 26 25 } 27 26 28 27 static bool bpf_cgrp_storage_trylock(void) 29 28 { 30 - migrate_disable(); 29 + cant_migrate(); 31 30 if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { 32 31 this_cpu_dec(bpf_cgrp_storage_busy); 33 - migrate_enable(); 34 32 return false; 35 33 } 36 34 return true; ··· 45 47 { 46 48 struct bpf_local_storage *local_storage; 47 49 50 + migrate_disable(); 48 51 rcu_read_lock(); 49 52 local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); 50 - if (!local_storage) { 51 - rcu_read_unlock(); 52 - return; 53 - } 53 + if (!local_storage) 54 + goto out; 54 55 55 56 bpf_cgrp_storage_lock(); 56 57 bpf_local_storage_destroy(local_storage); 57 58 bpf_cgrp_storage_unlock(); 59 + out: 58 60 rcu_read_unlock(); 61 + migrate_enable(); 59 62 } 60 63 61 64 static struct bpf_local_storage_data *
+5 -4
kernel/bpf/bpf_inode_storage.c
··· 62 62 if (!bsb) 63 63 return; 64 64 65 + migrate_disable(); 65 66 rcu_read_lock(); 66 67 67 68 local_storage = rcu_dereference(bsb->storage); 68 - if (!local_storage) { 69 - rcu_read_unlock(); 70 - return; 71 - } 69 + if (!local_storage) 70 + goto out; 72 71 73 72 bpf_local_storage_destroy(local_storage); 73 + out: 74 74 rcu_read_unlock(); 75 + migrate_enable(); 75 76 } 76 77 77 78 static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
+9 -21
kernel/bpf/bpf_local_storage.c
··· 81 81 return NULL; 82 82 83 83 if (smap->bpf_ma) { 84 - migrate_disable(); 85 84 selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags); 86 - migrate_enable(); 87 85 if (selem) 88 86 /* Keep the original bpf_map_kzalloc behavior 89 87 * before started using the bpf_mem_cache_alloc. ··· 172 174 return; 173 175 } 174 176 175 - if (smap) { 176 - migrate_disable(); 177 + if (smap) 177 178 bpf_mem_cache_free(&smap->storage_ma, local_storage); 178 - migrate_enable(); 179 - } else { 179 + else 180 180 /* smap could be NULL if the selem that triggered 181 181 * this 'local_storage' creation had been long gone. 182 182 * In this case, directly do call_rcu(). 183 183 */ 184 184 call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); 185 - } 186 185 } 187 186 188 187 /* rcu tasks trace callback for bpf_ma == false */ ··· 212 217 selem = container_of(rcu, struct bpf_local_storage_elem, rcu); 213 218 /* The bpf_local_storage_map_free will wait for rcu_barrier */ 214 219 smap = rcu_dereference_check(SDATA(selem)->smap, 1); 220 + 221 + migrate_disable(); 215 222 bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); 223 + migrate_enable(); 216 224 bpf_mem_cache_raw_free(selem); 217 225 } 218 226 ··· 254 256 * bpf_mem_cache_free will be able to reuse selem 255 257 * immediately. 
256 258 */ 257 - migrate_disable(); 258 259 bpf_mem_cache_free(&smap->selem_ma, selem); 259 - migrate_enable(); 260 260 return; 261 261 } 262 262 ··· 493 497 if (err) 494 498 return err; 495 499 496 - if (smap->bpf_ma) { 497 - migrate_disable(); 500 + if (smap->bpf_ma) 498 501 storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags); 499 - migrate_enable(); 500 - } else { 502 + else 501 503 storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), 502 504 gfp_flags | __GFP_NOWARN); 503 - } 504 - 505 505 if (!storage) { 506 506 err = -ENOMEM; 507 507 goto uncharge; ··· 894 902 while ((selem = hlist_entry_safe( 895 903 rcu_dereference_raw(hlist_first_rcu(&b->list)), 896 904 struct bpf_local_storage_elem, map_node))) { 897 - if (busy_counter) { 898 - migrate_disable(); 905 + if (busy_counter) 899 906 this_cpu_inc(*busy_counter); 900 - } 901 907 bpf_selem_unlink(selem, true); 902 - if (busy_counter) { 908 + if (busy_counter) 903 909 this_cpu_dec(*busy_counter); 904 - migrate_enable(); 905 - } 906 910 cond_resched_rcu(); 907 911 } 908 912 rcu_read_unlock();
+7 -8
kernel/bpf/bpf_task_storage.c
··· 24 24 25 25 static void bpf_task_storage_lock(void) 26 26 { 27 - migrate_disable(); 27 + cant_migrate(); 28 28 this_cpu_inc(bpf_task_storage_busy); 29 29 } 30 30 31 31 static void bpf_task_storage_unlock(void) 32 32 { 33 33 this_cpu_dec(bpf_task_storage_busy); 34 - migrate_enable(); 35 34 } 36 35 37 36 static bool bpf_task_storage_trylock(void) 38 37 { 39 - migrate_disable(); 38 + cant_migrate(); 40 39 if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { 41 40 this_cpu_dec(bpf_task_storage_busy); 42 - migrate_enable(); 43 41 return false; 44 42 } 45 43 return true; ··· 70 72 { 71 73 struct bpf_local_storage *local_storage; 72 74 75 + migrate_disable(); 73 76 rcu_read_lock(); 74 77 75 78 local_storage = rcu_dereference(task->bpf_storage); 76 - if (!local_storage) { 77 - rcu_read_unlock(); 78 - return; 79 - } 79 + if (!local_storage) 80 + goto out; 80 81 81 82 bpf_task_storage_lock(); 82 83 bpf_local_storage_destroy(local_storage); 83 84 bpf_task_storage_unlock(); 85 + out: 84 86 rcu_read_unlock(); 87 + migrate_enable(); 85 88 } 86 89 87 90 static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
-2
kernel/bpf/cpumask.c
··· 91 91 if (!refcount_dec_and_test(&cpumask->usage)) 92 92 return; 93 93 94 - migrate_disable(); 95 94 bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask); 96 - migrate_enable(); 97 95 } 98 96 99 97 __bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
+7 -12
kernel/bpf/hashtab.c
··· 897 897 { 898 898 check_and_free_fields(htab, l); 899 899 900 - migrate_disable(); 901 900 if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) 902 901 bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); 903 902 bpf_mem_cache_free(&htab->ma, l); 904 - migrate_enable(); 905 903 } 906 904 907 905 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) ··· 1500 1502 { 1501 1503 int i; 1502 1504 1503 - /* It's called from a worker thread, so disable migration here, 1504 - * since bpf_mem_cache_free() relies on that. 1505 + /* It's called from a worker thread and migration has been disabled, 1506 + * therefore, it is OK to invoke bpf_mem_cache_free() directly. 1505 1507 */ 1506 - migrate_disable(); 1507 1508 for (i = 0; i < htab->n_buckets; i++) { 1508 1509 struct hlist_nulls_head *head = select_bucket(htab, i); 1509 1510 struct hlist_nulls_node *n; ··· 1514 1517 } 1515 1518 cond_resched(); 1516 1519 } 1517 - migrate_enable(); 1518 1520 } 1519 1521 1520 1522 static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) ··· 2204 2208 bool is_percpu; 2205 2209 u64 ret = 0; 2206 2210 2211 + cant_migrate(); 2212 + 2207 2213 if (flags != 0) 2208 2214 return -EINVAL; 2209 2215 2210 2216 is_percpu = htab_is_percpu(htab); 2211 2217 2212 2218 roundup_key_size = round_up(map->key_size, 8); 2213 - /* disable migration so percpu value prepared here will be the 2214 - * same as the one seen by the bpf program with bpf_map_lookup_elem(). 2219 + /* migration has been disabled, so percpu value prepared here will be 2220 + * the same as the one seen by the bpf program with 2221 + * bpf_map_lookup_elem(). 2215 2222 */ 2216 - if (is_percpu) 2217 - migrate_disable(); 2218 2223 for (i = 0; i < htab->n_buckets; i++) { 2219 2224 b = &htab->buckets[i]; 2220 2225 rcu_read_lock(); ··· 2241 2244 rcu_read_unlock(); 2242 2245 } 2243 2246 out: 2244 - if (is_percpu) 2245 - migrate_enable(); 2246 2247 return num_elems; 2247 2248 } 2248 2249
-4
kernel/bpf/helpers.c
··· 2066 2066 /* The contained type can also have resources, including a 2067 2067 * bpf_list_head which needs to be freed. 2068 2068 */ 2069 - migrate_disable(); 2070 2069 __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); 2071 - migrate_enable(); 2072 2070 } 2073 2071 } 2074 2072 ··· 2103 2105 obj -= field->graph_root.node_offset; 2104 2106 2105 2107 2106 - migrate_disable(); 2107 2108 __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); 2108 - migrate_enable(); 2109 2109 } 2110 2110 } 2111 2111
+4 -16
kernel/bpf/lpm_trie.c
··· 289 289 } 290 290 291 291 static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie, 292 - const void *value, 293 - bool disable_migration) 292 + const void *value) 294 293 { 295 294 struct lpm_trie_node *node; 296 295 297 - if (disable_migration) 298 - migrate_disable(); 299 296 node = bpf_mem_cache_alloc(&trie->ma); 300 - if (disable_migration) 301 - migrate_enable(); 302 297 303 298 if (!node) 304 299 return NULL; ··· 337 342 if (key->prefixlen > trie->max_prefixlen) 338 343 return -EINVAL; 339 344 340 - /* Allocate and fill a new node. Need to disable migration before 341 - * invoking bpf_mem_cache_alloc(). 342 - */ 343 - new_node = lpm_trie_node_alloc(trie, value, true); 345 + /* Allocate and fill a new node */ 346 + new_node = lpm_trie_node_alloc(trie, value); 344 347 if (!new_node) 345 348 return -ENOMEM; 346 349 ··· 418 425 goto out; 419 426 } 420 427 421 - /* migration is disabled within the locked scope */ 422 - im_node = lpm_trie_node_alloc(trie, NULL, false); 428 + im_node = lpm_trie_node_alloc(trie, NULL); 423 429 if (!im_node) { 424 430 trie->n_entries--; 425 431 ret = -ENOMEM; ··· 444 452 out: 445 453 raw_spin_unlock_irqrestore(&trie->lock, irq_flags); 446 454 447 - migrate_disable(); 448 455 if (ret) 449 456 bpf_mem_cache_free(&trie->ma, new_node); 450 457 bpf_mem_cache_free_rcu(&trie->ma, free_node); 451 - migrate_enable(); 452 458 453 459 return ret; 454 460 } ··· 545 555 out: 546 556 raw_spin_unlock_irqrestore(&trie->lock, irq_flags); 547 557 548 - migrate_disable(); 549 558 bpf_mem_cache_free_rcu(&trie->ma, free_parent); 550 559 bpf_mem_cache_free_rcu(&trie->ma, free_node); 551 - migrate_enable(); 552 560 553 561 return ret; 554 562 }
-2
kernel/bpf/range_tree.c
··· 259 259 260 260 while ((rn = range_it_iter_first(rt, 0, -1U))) { 261 261 range_it_remove(rn, rt); 262 - migrate_disable(); 263 262 bpf_mem_free(&bpf_global_ma, rn); 264 - migrate_enable(); 265 263 } 266 264 } 267 265
+7 -3
kernel/bpf/syscall.c
··· 796 796 if (!btf_is_kernel(field->kptr.btf)) { 797 797 pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, 798 798 field->kptr.btf_id); 799 - migrate_disable(); 800 799 __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? 801 800 pointee_struct_meta->record : NULL, 802 801 fields[i].type == BPF_KPTR_PERCPU); 803 - migrate_enable(); 804 802 } else { 805 803 field->kptr.dtor(xchgd_field); 806 804 } ··· 833 835 struct btf_record *rec = map->record; 834 836 struct btf *btf = map->btf; 835 837 836 - /* implementation dependent freeing */ 838 + /* implementation dependent freeing. Disabling migration to simplify 839 + * the free of values or special fields allocated from bpf memory 840 + * allocator. 841 + */ 842 + migrate_disable(); 837 843 map->ops->map_free(map); 844 + migrate_enable(); 845 + 838 846 /* Delay freeing of btf_record for maps, as map_free 839 847 * callback usually needs access to them. It is better to do it here 840 848 * than require each callback to do the free itself manually.
+7 -4
net/core/bpf_sk_storage.c
··· 50 50 { 51 51 struct bpf_local_storage *sk_storage; 52 52 53 + migrate_disable(); 53 54 rcu_read_lock(); 54 55 sk_storage = rcu_dereference(sk->sk_bpf_storage); 55 - if (!sk_storage) { 56 - rcu_read_unlock(); 57 - return; 58 - } 56 + if (!sk_storage) 57 + goto out; 59 58 60 59 bpf_local_storage_destroy(sk_storage); 60 + out: 61 61 rcu_read_unlock(); 62 + migrate_enable(); 62 63 } 63 64 64 65 static void bpf_sk_storage_map_free(struct bpf_map *map) ··· 161 160 162 161 RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); 163 162 163 + migrate_disable(); 164 164 rcu_read_lock(); 165 165 sk_storage = rcu_dereference(sk->sk_bpf_storage); 166 166 ··· 214 212 215 213 out: 216 214 rcu_read_unlock(); 215 + migrate_enable(); 217 216 218 217 /* In case of an error, don't free anything explicitly here, the 219 218 * caller is responsible to call bpf_sk_storage_free.