Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

netfilter: nft_set_rbtree: don't gc elements on insert

During insertion we can queue up expired elements for garbage
collection.

In case of later abort, the commit hook will never be called.
The packet path and 'get' requests will then find freed elements in the
binary search blob:

nft_set_ext_key include/net/netfilter/nf_tables.h:800 [inline]
nft_array_get_cmp+0x1f6/0x2a0 net/netfilter/nft_set_rbtree.c:133
__inline_bsearch include/linux/bsearch.h:15 [inline]
bsearch+0x50/0xc0 lib/bsearch.c:33
nft_rbtree_get+0x16b/0x400 net/netfilter/nft_set_rbtree.c:169
nft_setelem_get net/netfilter/nf_tables_api.c:6495 [inline]
nft_get_set_elem+0x420/0xaa0 net/netfilter/nf_tables_api.c:6543
nf_tables_getsetelem+0x448/0x5e0 net/netfilter/nf_tables_api.c:6632
nfnetlink_rcv_msg+0x8ae/0x12c0 net/netfilter/nfnetlink.c:290

Also, when we insert an element that triggers -EEXIST, and that insertion
happens to also zap a timed-out entry, we end up with the same issue:
neither the commit nor the abort hook is called.

Fix this by removing GC API usage during insertion.

The blamed commit also removes concurrency of the rbtree with the
packet path, so we can now safely rb_erase() the element and move
it to a new expired list that can be reaped in the commit hook
before building the next blob iteration.

This also avoids the need to rebuild the blob in the abort path:
Expired elements seen during insertion attempts are kept around
until a transaction passes.

Reported-by: syzbot+d417922a3e7935517ef6@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d417922a3e7935517ef6
Fixes: 7e43e0a1141d ("netfilter: nft_set_rbtree: translate rbtree to array for binary search")
Signed-off-by: Florian Westphal <fw@strlen.de>

+68 -68
+68 -68
net/netfilter/nft_set_rbtree.c
··· 34 34 struct nft_array __rcu *array; 35 35 struct nft_array *array_next; 36 36 unsigned long last_gc; 37 + struct list_head expired; 37 38 }; 38 39 39 40 struct nft_rbtree_elem { 40 41 struct nft_elem_priv priv; 41 - struct rb_node node; 42 + union { 43 + struct rb_node node; 44 + struct list_head list; 45 + }; 42 46 struct nft_set_ext ext; 43 47 }; 44 48 ··· 183 179 return &rbe->priv; 184 180 } 185 181 186 - static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set, 187 - struct nft_rbtree *priv, 188 - struct nft_rbtree_elem *rbe) 182 + static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set, 183 + struct nft_rbtree *priv, 184 + struct nft_rbtree_elem *rbe) 189 185 { 190 186 lockdep_assert_held_write(&priv->lock); 191 187 nft_setelem_data_deactivate(net, set, &rbe->priv); 192 188 rb_erase(&rbe->node, &priv->root); 189 + 190 + /* collected later on in commit callback */ 191 + list_add(&rbe->list, &priv->expired); 193 192 } 194 193 195 194 static const struct nft_rbtree_elem * ··· 203 196 struct rb_node *prev = rb_prev(&rbe->node); 204 197 struct net *net = read_pnet(&set->net); 205 198 struct nft_rbtree_elem *rbe_prev; 206 - struct nft_trans_gc *gc; 207 - 208 - gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); 209 - if (!gc) 210 - return ERR_PTR(-ENOMEM); 211 199 212 200 /* search for end interval coming before this element. 213 201 * end intervals don't carry a timeout extension, they ··· 220 218 rbe_prev = NULL; 221 219 if (prev) { 222 220 rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); 223 - nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev); 224 - 225 - /* There is always room in this trans gc for this element, 226 - * memory allocation never actually happens, hence, the warning 227 - * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, 228 - * this is synchronous gc which never fails. 
229 - */ 230 - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); 231 - if (WARN_ON_ONCE(!gc)) 232 - return ERR_PTR(-ENOMEM); 233 - 234 - nft_trans_gc_elem_add(gc, rbe_prev); 221 + nft_rbtree_gc_elem_move(net, set, priv, rbe_prev); 235 222 } 236 223 237 - nft_rbtree_gc_elem_remove(net, set, priv, rbe); 238 - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); 239 - if (WARN_ON_ONCE(!gc)) 240 - return ERR_PTR(-ENOMEM); 241 - 242 - nft_trans_gc_elem_add(gc, rbe); 243 - 244 - nft_trans_gc_queue_sync_done(gc); 224 + nft_rbtree_gc_elem_move(net, set, priv, rbe); 245 225 246 226 return rbe_prev; 247 227 } ··· 659 675 } 660 676 } 661 677 662 - static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, 663 - struct nft_rbtree *priv, 664 - struct nft_rbtree_elem *rbe) 665 - { 666 - nft_setelem_data_deactivate(net, set, &rbe->priv); 667 - nft_rbtree_erase(priv, rbe); 668 - } 669 - 670 - static void nft_rbtree_gc(struct nft_set *set) 678 + static void nft_rbtree_gc_scan(struct nft_set *set) 671 679 { 672 680 struct nft_rbtree *priv = nft_set_priv(set); 673 681 struct nft_rbtree_elem *rbe, *rbe_end = NULL; 674 682 struct net *net = read_pnet(&set->net); 675 683 u64 tstamp = nft_net_tstamp(net); 676 684 struct rb_node *node, *next; 677 - struct nft_trans_gc *gc; 678 - 679 - set = nft_set_container_of(priv); 680 - net = read_pnet(&set->net); 681 - 682 - gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); 683 - if (!gc) 684 - return; 685 685 686 686 for (node = rb_first(&priv->root); node ; node = next) { 687 687 next = rb_next(node); ··· 683 715 if (!__nft_set_elem_expired(&rbe->ext, tstamp)) 684 716 continue; 685 717 686 - gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); 687 - if (!gc) 688 - goto try_later; 689 - 690 718 /* end element needs to be removed first, it has 691 719 * no timeout extension. 
692 720 */ 721 + write_lock_bh(&priv->lock); 693 722 if (rbe_end) { 694 - nft_rbtree_gc_remove(net, set, priv, rbe_end); 695 - nft_trans_gc_elem_add(gc, rbe_end); 723 + nft_rbtree_gc_elem_move(net, set, priv, rbe_end); 696 724 rbe_end = NULL; 697 725 } 698 726 727 + nft_rbtree_gc_elem_move(net, set, priv, rbe); 728 + write_unlock_bh(&priv->lock); 729 + } 730 + 731 + priv->last_gc = jiffies; 732 + } 733 + 734 + static void nft_rbtree_gc_queue(struct nft_set *set) 735 + { 736 + struct nft_rbtree *priv = nft_set_priv(set); 737 + struct nft_rbtree_elem *rbe, *rbe_end; 738 + struct nft_trans_gc *gc; 739 + 740 + if (list_empty(&priv->expired)) 741 + return; 742 + 743 + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); 744 + if (!gc) 745 + return; 746 + 747 + list_for_each_entry_safe(rbe, rbe_end, &priv->expired, list) { 748 + list_del(&rbe->list); 749 + nft_trans_gc_elem_add(gc, rbe); 750 + 699 751 gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); 700 752 if (!gc) 701 - goto try_later; 702 - 703 - nft_rbtree_gc_remove(net, set, priv, rbe); 704 - nft_trans_gc_elem_add(gc, rbe); 753 + return; 705 754 } 706 755 707 - try_later: 708 - 709 - if (gc) { 710 - gc = nft_trans_gc_catchall_sync(gc); 711 - nft_trans_gc_queue_sync_done(gc); 712 - priv->last_gc = jiffies; 713 - } 756 + gc = nft_trans_gc_catchall_sync(gc); 757 + nft_trans_gc_queue_sync_done(gc); 714 758 } 715 759 716 760 static u64 nft_rbtree_privsize(const struct nlattr * const nla[], ··· 741 761 742 762 rwlock_init(&priv->lock); 743 763 priv->root = RB_ROOT; 764 + INIT_LIST_HEAD(&priv->expired); 744 765 745 766 priv->array = NULL; 746 767 priv->array_next = NULL; ··· 759 778 const struct nft_set *set) 760 779 { 761 780 struct nft_rbtree *priv = nft_set_priv(set); 762 - struct nft_rbtree_elem *rbe; 781 + struct nft_rbtree_elem *rbe, *next; 763 782 struct nft_array *array; 764 783 struct rb_node *node; 784 + 785 + list_for_each_entry_safe(rbe, next, &priv->expired, list) { 786 + list_del(&rbe->list); 787 + 
nf_tables_set_elem_destroy(ctx, set, &rbe->priv); 788 + } 765 789 766 790 while ((node = priv->root.rb_node) != NULL) { 767 791 rb_erase(node, &priv->root); ··· 814 828 u32 num_intervals = 0; 815 829 struct rb_node *node; 816 830 817 - if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) 818 - nft_rbtree_gc(set); 819 - 820 831 /* No changes, skip, eg. elements updates only. */ 821 832 if (!priv->array_next) 822 833 return; 834 + 835 + /* GC can be performed if the binary search blob is going 836 + * to be rebuilt. It has to be done in two phases: first 837 + * scan tree and move all expired elements to the expired 838 + * list. 839 + * 840 + * Then, after blob has been re-built and published to other 841 + * CPUs, queue collected entries for freeing. 842 + */ 843 + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) 844 + nft_rbtree_gc_scan(set); 823 845 824 846 /* Reverse walk to create an array from smaller to largest interval. */ 825 847 node = rb_last(&priv->root); ··· 875 881 num_intervals++; 876 882 err_out: 877 883 priv->array_next->num_intervals = num_intervals; 878 - old = rcu_replace_pointer(priv->array, priv->array_next, true); 884 + old = rcu_replace_pointer(priv->array, priv->array_next, 885 + lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex)); 879 886 priv->array_next = NULL; 880 887 if (old) 881 888 call_rcu(&old->rcu_head, nft_array_free_rcu); 889 + 890 + /* New blob is public, queue collected entries for freeing. 891 + * call_rcu ensures elements stay around until readers are done. 892 + */ 893 + nft_rbtree_gc_queue(set); 882 894 } 883 895 884 896 static void nft_rbtree_abort(const struct nft_set *set)