Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'nf-next-26-01-22' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next

Florian Westphal says:

====================
netfilter: updates for net-next

There is an issue with interval matching in the nftables rbtree set type:
when userspace sends us set updates, there is a brief window where
false negative lookups may occur from the data plane. Quoting Pablo's
original cover letter:

This series addresses this issue by translating the rbtree, which keeps
the intervals in order, to an array for binary search. The array is
published to the packet path through RCU. The idea is to keep using the
rbtree data structure for the control plane, which needs to deal with
updates, then generate an array from this rbtree for binary search lookups.

Patch #1 allows .remove to be called even when .abort is defined, which is
needed by this new approach. Only pipapo needs to skip .remove, to speed up
the abort path.

Patch #2 adds the binary search array approach for interval matching.

Patch #3 updates .get to use the binary search array to find the
(closest or exact) matching interval.

Patch #4 removes seqcount_rwlock_t as it is not needed anymore (new in
this series).

* tag 'nf-next-26-01-22' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
netfilter: nft_set_rbtree: remove seqcount_rwlock_t
netfilter: nft_set_rbtree: use binary search array in get command
netfilter: nft_set_rbtree: translate rbtree to array for binary search
netfilter: nf_tables: add .abort_skip_removal flag for set types
====================

Link: https://patch.msgid.link/20260122162935.8581-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+288 -142
+2
include/net/netfilter/nf_tables.h
··· 451 451 * @init: initialize private data of new set instance 452 452 * @destroy: destroy private data of set instance 453 453 * @gc_init: initialize garbage collection 454 + * @abort_skip_removal: skip removal of elements from abort path 454 455 * @elemsize: element private size 455 456 * 456 457 * Operations lookup, update and delete have simpler interfaces, are faster ··· 509 508 const struct nft_set *set); 510 509 void (*gc_init)(const struct nft_set *set); 511 510 511 + bool abort_skip_removal; 512 512 unsigned int elemsize; 513 513 }; 514 514
+2 -1
net/netfilter/nf_tables_api.c
··· 7807 7807 continue; 7808 7808 } 7809 7809 7810 - if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) 7810 + if (!te->set->ops->abort_skip_removal || 7811 + nft_setelem_is_catchall(te->set, te->elems[i].priv)) 7811 7812 nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); 7812 7813 7813 7814 if (!nft_setelem_is_catchall(te->set, te->elems[i].priv))
+2
net/netfilter/nft_set_pipapo.c
··· 2370 2370 .gc_init = nft_pipapo_gc_init, 2371 2371 .commit = nft_pipapo_commit, 2372 2372 .abort = nft_pipapo_abort, 2373 + .abort_skip_removal = true, 2373 2374 .elemsize = offsetof(struct nft_pipapo_elem, ext), 2374 2375 }, 2375 2376 }; ··· 2395 2394 .gc_init = nft_pipapo_gc_init, 2396 2395 .commit = nft_pipapo_commit, 2397 2396 .abort = nft_pipapo_abort, 2397 + .abort_skip_removal = true, 2398 2398 .elemsize = offsetof(struct nft_pipapo_elem, ext), 2399 2399 }, 2400 2400 };
+282 -141
net/netfilter/nft_set_rbtree.c
··· 10 10 #include <linux/module.h> 11 11 #include <linux/list.h> 12 12 #include <linux/rbtree.h> 13 + #include <linux/bsearch.h> 13 14 #include <linux/netlink.h> 14 15 #include <linux/netfilter.h> 15 16 #include <linux/netfilter/nf_tables.h> 16 17 #include <net/netfilter/nf_tables_core.h> 17 18 19 + struct nft_array_interval { 20 + struct nft_set_ext *from; 21 + struct nft_set_ext *to; 22 + }; 23 + 24 + struct nft_array { 25 + u32 max_intervals; 26 + u32 num_intervals; 27 + struct nft_array_interval *intervals; 28 + struct rcu_head rcu_head; 29 + }; 30 + 18 31 struct nft_rbtree { 19 32 struct rb_root root; 20 33 rwlock_t lock; 21 - seqcount_rwlock_t count; 34 + struct nft_array __rcu *array; 35 + struct nft_array *array_next; 22 36 unsigned long last_gc; 23 37 }; 24 38 ··· 61 47 set->klen); 62 48 } 63 49 64 - static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) 50 + struct nft_array_lookup_ctx { 51 + const u32 *key; 52 + u32 klen; 53 + }; 54 + 55 + static int nft_array_lookup_cmp(const void *pkey, const void *entry) 65 56 { 66 - return nft_set_elem_expired(&rbe->ext); 67 - } 57 + const struct nft_array_interval *interval = entry; 58 + const struct nft_array_lookup_ctx *ctx = pkey; 59 + int a, b; 68 60 69 - static const struct nft_set_ext * 70 - __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, 71 - const u32 *key, unsigned int seq) 72 - { 73 - struct nft_rbtree *priv = nft_set_priv(set); 74 - const struct nft_rbtree_elem *rbe, *interval = NULL; 75 - u8 genmask = nft_genmask_cur(net); 76 - const struct rb_node *parent; 77 - int d; 61 + if (!interval->from) 62 + return 1; 78 63 79 - parent = rcu_dereference_raw(priv->root.rb_node); 80 - while (parent != NULL) { 81 - if (read_seqcount_retry(&priv->count, seq)) 82 - return NULL; 64 + a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen); 65 + if (!interval->to) 66 + b = -1; 67 + else 68 + b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen); 83 69 84 - rbe = 
rb_entry(parent, struct nft_rbtree_elem, node); 70 + if (a >= 0 && b < 0) 71 + return 0; 85 72 86 - d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); 87 - if (d < 0) { 88 - parent = rcu_dereference_raw(parent->rb_left); 89 - if (interval && 90 - !nft_rbtree_cmp(set, rbe, interval) && 91 - nft_rbtree_interval_end(rbe) && 92 - nft_rbtree_interval_start(interval)) 93 - continue; 94 - if (nft_set_elem_active(&rbe->ext, genmask) && 95 - !nft_rbtree_elem_expired(rbe)) 96 - interval = rbe; 97 - } else if (d > 0) 98 - parent = rcu_dereference_raw(parent->rb_right); 99 - else { 100 - if (!nft_set_elem_active(&rbe->ext, genmask)) { 101 - parent = rcu_dereference_raw(parent->rb_left); 102 - continue; 103 - } 73 + if (a < 0) 74 + return -1; 104 75 105 - if (nft_rbtree_elem_expired(rbe)) 106 - return NULL; 107 - 108 - if (nft_rbtree_interval_end(rbe)) { 109 - if (nft_set_is_anonymous(set)) 110 - return NULL; 111 - parent = rcu_dereference_raw(parent->rb_left); 112 - interval = NULL; 113 - continue; 114 - } 115 - 116 - return &rbe->ext; 117 - } 118 - } 119 - 120 - if (set->flags & NFT_SET_INTERVAL && interval != NULL && 121 - nft_rbtree_interval_start(interval)) 122 - return &interval->ext; 123 - 124 - return NULL; 76 + return 1; 125 77 } 126 78 127 79 INDIRECT_CALLABLE_SCOPE ··· 96 116 const u32 *key) 97 117 { 98 118 struct nft_rbtree *priv = nft_set_priv(set); 99 - unsigned int seq = read_seqcount_begin(&priv->count); 100 - const struct nft_set_ext *ext; 119 + struct nft_array *array = rcu_dereference(priv->array); 120 + const struct nft_array_interval *interval; 121 + struct nft_array_lookup_ctx ctx = { 122 + .key = key, 123 + .klen = set->klen, 124 + }; 101 125 102 - ext = __nft_rbtree_lookup(net, set, key, seq); 103 - if (ext || !read_seqcount_retry(&priv->count, seq)) 104 - return ext; 126 + if (!array) 127 + return NULL; 105 128 106 - read_lock_bh(&priv->lock); 107 - seq = read_seqcount_begin(&priv->count); 108 - ext = __nft_rbtree_lookup(net, set, key, seq); 109 - 
read_unlock_bh(&priv->lock); 129 + interval = bsearch(&ctx, array->intervals, array->num_intervals, 130 + sizeof(struct nft_array_interval), 131 + nft_array_lookup_cmp); 132 + if (!interval || nft_set_elem_expired(interval->from)) 133 + return NULL; 110 134 111 - return ext; 135 + return interval->from; 112 136 } 113 137 114 - static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, 115 - const u32 *key, struct nft_rbtree_elem **elem, 116 - unsigned int seq, unsigned int flags, u8 genmask) 138 + struct nft_array_get_ctx { 139 + const u32 *key; 140 + unsigned int flags; 141 + u32 klen; 142 + }; 143 + 144 + static int nft_array_get_cmp(const void *pkey, const void *entry) 117 145 { 118 - struct nft_rbtree_elem *rbe, *interval = NULL; 119 - struct nft_rbtree *priv = nft_set_priv(set); 120 - const struct rb_node *parent; 121 - const void *this; 122 - int d; 146 + const struct nft_array_interval *interval = entry; 147 + const struct nft_array_get_ctx *ctx = pkey; 148 + int a, b; 123 149 124 - parent = rcu_dereference_raw(priv->root.rb_node); 125 - while (parent != NULL) { 126 - if (read_seqcount_retry(&priv->count, seq)) 127 - return false; 150 + if (!interval->from) 151 + return 1; 128 152 129 - rbe = rb_entry(parent, struct nft_rbtree_elem, node); 153 + a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen); 154 + if (!interval->to) 155 + b = -1; 156 + else 157 + b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen); 130 158 131 - this = nft_set_ext_key(&rbe->ext); 132 - d = memcmp(this, key, set->klen); 133 - if (d < 0) { 134 - parent = rcu_dereference_raw(parent->rb_left); 135 - if (!(flags & NFT_SET_ELEM_INTERVAL_END)) 136 - interval = rbe; 137 - } else if (d > 0) { 138 - parent = rcu_dereference_raw(parent->rb_right); 139 - if (flags & NFT_SET_ELEM_INTERVAL_END) 140 - interval = rbe; 141 - } else { 142 - if (!nft_set_elem_active(&rbe->ext, genmask)) { 143 - parent = rcu_dereference_raw(parent->rb_left); 144 - continue; 145 
- } 146 - 147 - if (nft_set_elem_expired(&rbe->ext)) 148 - return false; 149 - 150 - if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || 151 - (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == 152 - (flags & NFT_SET_ELEM_INTERVAL_END)) { 153 - *elem = rbe; 154 - return true; 155 - } 156 - 157 - if (nft_rbtree_interval_end(rbe)) 158 - interval = NULL; 159 - 160 - parent = rcu_dereference_raw(parent->rb_left); 161 - } 159 + if (a >= 0) { 160 + if (ctx->flags & NFT_SET_ELEM_INTERVAL_END && b <= 0) 161 + return 0; 162 + else if (b < 0) 163 + return 0; 162 164 } 163 165 164 - if (set->flags & NFT_SET_INTERVAL && interval != NULL && 165 - nft_set_elem_active(&interval->ext, genmask) && 166 - !nft_set_elem_expired(&interval->ext) && 167 - ((!nft_rbtree_interval_end(interval) && 168 - !(flags & NFT_SET_ELEM_INTERVAL_END)) || 169 - (nft_rbtree_interval_end(interval) && 170 - (flags & NFT_SET_ELEM_INTERVAL_END)))) { 171 - *elem = interval; 172 - return true; 173 - } 166 + if (a < 0) 167 + return -1; 174 168 175 - return false; 169 + return 1; 176 170 } 177 171 178 172 static struct nft_elem_priv * ··· 154 200 const struct nft_set_elem *elem, unsigned int flags) 155 201 { 156 202 struct nft_rbtree *priv = nft_set_priv(set); 157 - unsigned int seq = read_seqcount_begin(&priv->count); 158 - struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT); 159 - const u32 *key = (const u32 *)&elem->key.val; 160 - u8 genmask = nft_genmask_cur(net); 161 - bool ret; 203 + struct nft_array *array = rcu_dereference(priv->array); 204 + const struct nft_array_interval *interval; 205 + struct nft_array_get_ctx ctx = { 206 + .key = (const u32 *)&elem->key.val, 207 + .flags = flags, 208 + .klen = set->klen, 209 + }; 210 + struct nft_rbtree_elem *rbe; 162 211 163 - ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); 164 - if (ret || !read_seqcount_retry(&priv->count, seq)) 165 - return &rbe->priv; 166 - 167 - read_lock_bh(&priv->lock); 168 - seq = 
read_seqcount_begin(&priv->count); 169 - ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); 170 - read_unlock_bh(&priv->lock); 171 - 172 - if (!ret) 212 + if (!array) 173 213 return ERR_PTR(-ENOENT); 214 + 215 + interval = bsearch(&ctx, array->intervals, array->num_intervals, 216 + sizeof(struct nft_array_interval), nft_array_get_cmp); 217 + if (!interval || nft_set_elem_expired(interval->from)) 218 + return ERR_PTR(-ENOENT); 219 + 220 + if (flags & NFT_SET_ELEM_INTERVAL_END) 221 + rbe = container_of(interval->to, struct nft_rbtree_elem, ext); 222 + else 223 + rbe = container_of(interval->from, struct nft_rbtree_elem, ext); 174 224 175 225 return &rbe->priv; 176 226 } ··· 439 481 return 0; 440 482 } 441 483 484 + static int nft_array_intervals_alloc(struct nft_array *array, u32 max_intervals) 485 + { 486 + struct nft_array_interval *intervals; 487 + 488 + intervals = kvcalloc(max_intervals, sizeof(struct nft_array_interval), 489 + GFP_KERNEL_ACCOUNT); 490 + if (!intervals) 491 + return -ENOMEM; 492 + 493 + if (array->intervals) 494 + kvfree(array->intervals); 495 + 496 + array->intervals = intervals; 497 + array->max_intervals = max_intervals; 498 + 499 + return 0; 500 + } 501 + 502 + static struct nft_array *nft_array_alloc(u32 max_intervals) 503 + { 504 + struct nft_array *array; 505 + 506 + array = kzalloc(sizeof(*array), GFP_KERNEL_ACCOUNT); 507 + if (!array) 508 + return NULL; 509 + 510 + if (nft_array_intervals_alloc(array, max_intervals) < 0) { 511 + kfree(array); 512 + return NULL; 513 + } 514 + 515 + return array; 516 + } 517 + 518 + #define NFT_ARRAY_EXTRA_SIZE 10240 519 + 520 + /* Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider 521 + * packed representation coming from userspace for anonymous sets too. 
522 + */ 523 + static u32 nft_array_elems(const struct nft_set *set) 524 + { 525 + u32 nelems = atomic_read(&set->nelems); 526 + 527 + /* Adjacent intervals are represented with a single start element in 528 + * anonymous sets, use the current element counter as is. 529 + */ 530 + if (nft_set_is_anonymous(set)) 531 + return nelems; 532 + 533 + /* Add extra room for never matching interval at the beginning and open 534 + * interval at the end which only use a single element to represent it. 535 + * The conversion to array will compact intervals, this allows reduce 536 + * memory consumption. 537 + */ 538 + return (nelems / 2) + 2; 539 + } 540 + 541 + static int nft_array_may_resize(const struct nft_set *set) 542 + { 543 + u32 nelems = nft_array_elems(set), new_max_intervals; 544 + struct nft_rbtree *priv = nft_set_priv(set); 545 + struct nft_array *array; 546 + 547 + if (!priv->array_next) { 548 + array = nft_array_alloc(nelems + NFT_ARRAY_EXTRA_SIZE); 549 + if (!array) 550 + return -ENOMEM; 551 + 552 + priv->array_next = array; 553 + } 554 + 555 + if (nelems < priv->array_next->max_intervals) 556 + return 0; 557 + 558 + new_max_intervals = priv->array_next->max_intervals + NFT_ARRAY_EXTRA_SIZE; 559 + if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0) 560 + return -ENOMEM; 561 + 562 + return 0; 563 + } 564 + 442 565 static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, 443 566 const struct nft_set_elem *elem, 444 567 struct nft_elem_priv **elem_priv) ··· 528 489 struct nft_rbtree *priv = nft_set_priv(set); 529 490 int err; 530 491 492 + if (nft_array_may_resize(set) < 0) 493 + return -ENOMEM; 494 + 531 495 do { 532 496 if (fatal_signal_pending(current)) 533 497 return -EINTR; ··· 538 496 cond_resched(); 539 497 540 498 write_lock_bh(&priv->lock); 541 - write_seqcount_begin(&priv->count); 542 499 err = __nft_rbtree_insert(net, set, rbe, elem_priv); 543 - write_seqcount_end(&priv->count); 544 500 
write_unlock_bh(&priv->lock); 545 501 } while (err == -EAGAIN); 546 502 ··· 548 508 static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) 549 509 { 550 510 write_lock_bh(&priv->lock); 551 - write_seqcount_begin(&priv->count); 552 511 rb_erase(&rbe->node, &priv->root); 553 - write_seqcount_end(&priv->count); 554 512 write_unlock_bh(&priv->lock); 555 513 } 556 514 ··· 590 552 u8 genmask = nft_genmask_next(net); 591 553 u64 tstamp = nft_net_tstamp(net); 592 554 int d; 555 + 556 + if (nft_array_may_resize(set) < 0) 557 + return NULL; 593 558 594 559 while (parent != NULL) { 595 560 rbe = rb_entry(parent, struct nft_rbtree_elem, node); ··· 656 615 switch (iter->type) { 657 616 case NFT_ITER_UPDATE: 658 617 lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); 618 + 619 + if (nft_array_may_resize(set) < 0) { 620 + iter->err = -ENOMEM; 621 + break; 622 + } 659 623 nft_rbtree_do_walk(ctx, set, iter); 660 624 break; 661 625 case NFT_ITER_READ: ··· 760 714 BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0); 761 715 762 716 rwlock_init(&priv->lock); 763 - seqcount_rwlock_init(&priv->count, &priv->lock); 764 717 priv->root = RB_ROOT; 765 718 719 + priv->array = NULL; 720 + priv->array_next = NULL; 721 + 766 722 return 0; 723 + } 724 + 725 + static void __nft_array_free(struct nft_array *array) 726 + { 727 + kvfree(array->intervals); 728 + kfree(array); 767 729 } 768 730 769 731 static void nft_rbtree_destroy(const struct nft_ctx *ctx, ··· 779 725 { 780 726 struct nft_rbtree *priv = nft_set_priv(set); 781 727 struct nft_rbtree_elem *rbe; 728 + struct nft_array *array; 782 729 struct rb_node *node; 783 730 784 731 while ((node = priv->root.rb_node) != NULL) { ··· 787 732 rbe = rb_entry(node, struct nft_rbtree_elem, node); 788 733 nf_tables_set_elem_destroy(ctx, set, &rbe->priv); 789 734 } 735 + 736 + array = rcu_dereference_protected(priv->array, true); 737 + if (array) 738 + __nft_array_free(array); 739 + if (priv->array_next) 740 + 
__nft_array_free(priv->array_next); 790 741 } 791 742 792 743 static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, ··· 813 752 return true; 814 753 } 815 754 755 + static void nft_array_free_rcu(struct rcu_head *rcu_head) 756 + { 757 + struct nft_array *array = container_of(rcu_head, struct nft_array, rcu_head); 758 + 759 + __nft_array_free(array); 760 + } 761 + 816 762 static void nft_rbtree_commit(struct nft_set *set) 817 763 { 818 764 struct nft_rbtree *priv = nft_set_priv(set); 765 + struct nft_rbtree_elem *rbe, *prev_rbe; 766 + struct nft_array *old; 767 + u32 num_intervals = 0; 768 + struct rb_node *node; 819 769 820 770 if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) 821 771 nft_rbtree_gc(set); 772 + 773 + /* No changes, skip, eg. elements updates only. */ 774 + if (!priv->array_next) 775 + return; 776 + 777 + /* Reverse walk to create an array from smaller to largest interval. */ 778 + node = rb_last(&priv->root); 779 + if (node) 780 + prev_rbe = rb_entry(node, struct nft_rbtree_elem, node); 781 + else 782 + prev_rbe = NULL; 783 + 784 + while (prev_rbe) { 785 + rbe = prev_rbe; 786 + 787 + if (nft_rbtree_interval_start(rbe)) 788 + priv->array_next->intervals[num_intervals].from = &rbe->ext; 789 + else if (nft_rbtree_interval_end(rbe)) 790 + priv->array_next->intervals[num_intervals++].to = &rbe->ext; 791 + 792 + if (num_intervals >= priv->array_next->max_intervals) { 793 + pr_warn_once("malformed interval set from userspace?"); 794 + goto err_out; 795 + } 796 + 797 + node = rb_prev(node); 798 + if (!node) 799 + break; 800 + 801 + prev_rbe = rb_entry(node, struct nft_rbtree_elem, node); 802 + 803 + /* For anonymous sets, when adjacent ranges are found, 804 + * the end element is not added to the set to pack the set 805 + * representation. Use next start element to complete this 806 + * interval. 
807 + */ 808 + if (nft_rbtree_interval_start(rbe) && 809 + nft_rbtree_interval_start(prev_rbe) && 810 + priv->array_next->intervals[num_intervals].from) 811 + priv->array_next->intervals[num_intervals++].to = &prev_rbe->ext; 812 + 813 + if (num_intervals >= priv->array_next->max_intervals) { 814 + pr_warn_once("malformed interval set from userspace?"); 815 + goto err_out; 816 + } 817 + } 818 + 819 + if (priv->array_next->intervals[num_intervals].from) 820 + num_intervals++; 821 + err_out: 822 + priv->array_next->num_intervals = num_intervals; 823 + old = rcu_replace_pointer(priv->array, priv->array_next, true); 824 + priv->array_next = NULL; 825 + if (old) 826 + call_rcu(&old->rcu_head, nft_array_free_rcu); 827 + } 828 + 829 + static void nft_rbtree_abort(const struct nft_set *set) 830 + { 831 + struct nft_rbtree *priv = nft_set_priv(set); 832 + struct nft_array *array_next; 833 + 834 + if (!priv->array_next) 835 + return; 836 + 837 + array_next = priv->array_next; 838 + priv->array_next = NULL; 839 + __nft_array_free(array_next); 822 840 } 823 841 824 842 static void nft_rbtree_gc_init(const struct nft_set *set) ··· 961 821 .flush = nft_rbtree_flush, 962 822 .activate = nft_rbtree_activate, 963 823 .commit = nft_rbtree_commit, 824 + .abort = nft_rbtree_abort, 964 825 .gc_init = nft_rbtree_gc_init, 965 826 .lookup = nft_rbtree_lookup, 966 827 .walk = nft_rbtree_walk,