Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'nf-next-26-02-06' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next

Florian Westphal says:

====================
netfilter: updates for net-next

The following patchset contains Netfilter updates for *net-next*:

1) Fix a net-next-only use-after-free bug in the nf_tables rbtree set type:
Expired elements can no longer be released right away after unlink,
because there is no guarantee that the binary-search blob is going to
be updated. Spotted by syzkaller.
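
The shape of the fix is a two-phase deferred release: unlinked elements
are parked on a private "expired" list and only handed to the freeing
machinery once the rebuilt lookup blob has been published (see the
nft_set_rbtree.c diff below). A minimal userspace sketch of the pattern,
with made-up names; the kernel additionally defers the final free via
call_rcu():

  /* Two-phase removal: phase 1 unlinks and parks the element, phase 2
   * frees it only after the new search structure is visible to readers.
   */
  #include <stdlib.h>

  struct elem {
      struct elem *next;          /* link in the expired list */
      /* ... key and data ... */
  };

  static struct elem *expired;    /* parked, maybe still visible via old blob */

  static void unlink_elem(struct elem *e)
  {
      /* phase 1: remove from the tree, but defer the free */
      e->next = expired;
      expired = e;
  }

  static void publish_new_blob(void)
  {
      /* rebuild and publish the binary-search blob here; only
       * afterwards may the parked elements be released.
       */
      while (expired) {
          struct elem *e = expired;

          expired = e->next;
          free(e);
      }
  }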

2) Fix an esoteric bug in nf_queue with udp fraglist gro, broken since
6.11. Patch 3 extends the nfqueue selftest to cover this.

4) Use a dedicated slab cache for flowtable entries; currently the generic
kmalloc-512 cache is used, which is wasteful. From Qingfang Deng.
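
For context on the waste: generic kmalloc falls back to power-of-two size
classes, so any object between 257 and 512 bytes occupies a 512-byte slot,
while a dedicated cache packs objects at their actual size. A hedged
kernel-module-style sketch of the pattern the patch applies (the struct and
names here are hypothetical; only KMEM_CACHE(), kmem_cache_zalloc() and
kmem_cache_destroy() mirror the diff below):

  #include <linux/slab.h>

  /* hypothetical object in the 257..512 byte range */
  struct demo_entry {
      unsigned long key;
      char payload[300];
  };

  static struct kmem_cache *demo_cachep;

  static int demo_cache_init(void)
  {
      /* object-sized cache instead of the kmalloc-512 size class */
      demo_cachep = KMEM_CACHE(demo_entry, SLAB_HWCACHE_ALIGN);
      return demo_cachep ? 0 : -ENOMEM;
  }

  static struct demo_entry *demo_entry_alloc(void)
  {
      /* zeroed object from the dedicated cache, replaces kzalloc() */
      return kmem_cache_zalloc(demo_cachep, GFP_ATOMIC);
  }

  static void demo_cache_exit(void)
  {
      kmem_cache_destroy(demo_cachep);
  }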

5) A recent net-next update extended an existing test to cover ip6ip6
tunnels; add the required selftest config entry. The test still passed
by accident because the previous test's network setup gets re-used, so
also update the test so it fails in case the ip6ip6 tunnel interface
cannot be added.

6) Fix 'nft get element mytable myset { 1.2.3.4 }' on big endian
platforms; this has been broken since the code was added in v5.1.
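
The root cause is visible in the nft_set_hash.c hunk below: the insert and
lookup fast paths hash a 4-byte key with jhash_1word() (a native-endian u32
load), while the get path used jhash(), whose tail assembles the word from
individual bytes, low byte first. The two inputs only coincide on little
endian, so on big endian the get path searched the wrong hash bucket. A
hedged userspace sketch of the mismatch (simplified; the real code is in
include/linux/jhash.h):

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
      uint8_t key[4] = { 1, 2, 3, 4 };    /* the bytes of 1.2.3.4 */
      uint32_t byte_wise, native;

      /* what jhash()'s byte-wise tail feeds into the hash mix */
      byte_wise = (uint32_t)key[0] | (uint32_t)key[1] << 8 |
                  (uint32_t)key[2] << 16 | (uint32_t)key[3] << 24;

      /* what jhash_1word(*(u32 *)key, seed) feeds into the mix */
      memcpy(&native, key, sizeof(native));

      /* equal on little endian, different on big endian; a different
       * input means a different bucket and a failed lookup.
       */
      printf("byte-wise: %08x  native: %08x\n",
             (unsigned)byte_wise, (unsigned)native);
      return 0;
  }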

7) Fix nf_tables counter reset support on 32bit platforms, where a counter
reset may cause huge values to appear due to wraparound.
Broken since the reset feature was added in v6.11. From Anders Grahn.
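
The u64_stats_sync.h hunk below shows why: on 32-bit kernels
u64_stats_add() takes an unsigned long, so negating a 64-bit total and
passing it to the add helper truncates it to 32 bits before it reaches the
counter. The fix adds a proper u64_stats_sub() taking an s64. A small
userspace sketch of the broken arithmetic, assuming a 32-bit unsigned long
as on i386 or arm32:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      uint64_t counter = 5;           /* per-cpu packet counter */
      uint64_t total = 5;             /* totals reported on reset */

      /* old code: u64_stats_add(&counter, -total); the negated u64 is
       * truncated to a 32-bit 0xfffffffb instead of staying -5.
       */
      uint32_t val = (uint32_t)-total;

      counter += val;                 /* 5 + 4294967291 = 4294967296 */
      printf("%llu\n", (unsigned long long)counter); /* not 0 */
      return 0;
  }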

8-11) Update the nf_tables rbtree set type to detect partial
overlaps. This will eventually speed up nftables userspace: at this
time, userspace does a netlink dump of the set content, which slows down
incremental updates on interval sets. From Pablo Neira Ayuso.
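
An illustrative sketch of the distinction the rbtree backend draws (not
kernel code; the error values match errno on Linux): each interval is
stored as a start element plus an end element, an exact duplicate is a full
overlap reported as -EEXIST (cleared by the caller unless NLM_F_EXCL is
given), and any other intersection is a partial overlap now rejected with
-ENOTEMPTY, so userspace no longer needs to dump the set to find out:

  #include <stdio.h>

  struct interval {
      unsigned int start;     /* key of the start element */
      unsigned int end;       /* key of the end element */
  };

  static int check_insert(const struct interval *old,
                          const struct interval *new)
  {
      if (old->start == new->start && old->end == new->end)
          return -17;         /* -EEXIST: full overlap */

      if (old->start < new->end && new->start < old->end)
          return -39;         /* -ENOTEMPTY: partial overlap */

      return 0;               /* disjoint: accepted */
  }

  int main(void)
  {
      struct interval old = { 10, 20 };
      struct interval dup = { 10, 20 };   /* full overlap */
      struct interval cut = { 15, 30 };   /* straddles the end: partial */
      struct interval ok  = { 20, 30 };   /* disjoint */

      printf("%d %d %d\n", check_insert(&old, &dup),
             check_insert(&old, &cut), check_insert(&old, &ok));
      return 0;
  }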

* tag 'nf-next-26-02-06' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
netfilter: nft_set_rbtree: validate open interval overlap
netfilter: nft_set_rbtree: validate element belonging to interval
netfilter: nft_set_rbtree: check for partial overlaps in anonymous sets
netfilter: nft_set_rbtree: fix bogus EEXIST with NLM_F_CREATE with null interval
netfilter: nft_counter: fix reset of counters on 32bit archs
netfilter: nft_set_hash: fix get operation on big endian
selftests: netfilter: add IPV6_TUNNEL to config
netfilter: flowtable: dedicated slab for flow entry
selftests: netfilter: nft_queue.sh: add udp fraglist gro test case
netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation
netfilter: nft_set_rbtree: don't gc elements on insert
====================

Link: https://patch.msgid.link/20260206153048.17570-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+580 -148
+10
include/linux/u64_stats_sync.h
···
 	local64_add(val, &p->v);
 }
 
+static inline void u64_stats_sub(u64_stats_t *p, s64 val)
+{
+	local64_sub(val, &p->v);
+}
+
 static inline void u64_stats_inc(u64_stats_t *p)
 {
 	local64_inc(&p->v);
···
 static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
 {
 	p->v += val;
+}
+
+static inline void u64_stats_sub(u64_stats_t *p, s64 val)
+{
+	p->v -= val;
 }
 
 static inline void u64_stats_inc(u64_stats_t *p)
+1
include/net/netfilter/nf_queue.h
···
 	struct net_device	*physout;
 #endif
 	struct nf_hook_state	state;
+	bool			nf_ct_is_unconfirmed;
 	u16			size; /* sizeof(entry) + saved route keys */
 	u16			queue_num;
 
+4
include/net/netfilter/nf_tables.h
···
 	unsigned char data[];
 };
 
+#define NFT_SET_ELEM_INTERNAL_LAST	0x1
+
 /* placeholder structure for opaque set element backend representation. */
 struct nft_elem_priv { };
 
···
  * @key: element key
  * @key_end: closing element key
  * @data: element data
+ * @flags: flags
  * @priv: element private data and extensions
  */
 struct nft_set_elem {
···
 		u32		buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
 		struct nft_data	val;
 	} data;
+	u32			flags;
 	struct nft_elem_priv	*priv;
 };
 
+10 -2
net/netfilter/nf_flow_table_core.c
···
 
 static DEFINE_MUTEX(flowtable_lock);
 static LIST_HEAD(flowtables);
+static __read_mostly struct kmem_cache *flow_offload_cachep;
 
 static void
 flow_offload_fill_dir(struct flow_offload *flow,
···
 	if (unlikely(nf_ct_is_dying(ct)))
 		return NULL;
 
-	flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
+	flow = kmem_cache_zalloc(flow_offload_cachep, GFP_ATOMIC);
 	if (!flow)
 		return NULL;
 
···
 {
 	int ret;
 
+	flow_offload_cachep = KMEM_CACHE(flow_offload, SLAB_HWCACHE_ALIGN);
+	if (!flow_offload_cachep)
+		return -ENOMEM;
+
 	ret = register_pernet_subsys(&nf_flow_table_net_ops);
 	if (ret < 0)
-		return ret;
+		goto out_pernet;
 
 	ret = nf_flow_table_offload_init();
 	if (ret)
···
 	nf_flow_table_offload_exit();
 out_offload:
 	unregister_pernet_subsys(&nf_flow_table_net_ops);
+out_pernet:
+	kmem_cache_destroy(flow_offload_cachep);
 	return ret;
 }
 
···
 {
 	nf_flow_table_offload_exit();
 	unregister_pernet_subsys(&nf_flow_table_net_ops);
+	kmem_cache_destroy(flow_offload_cachep);
 }
 
 module_init(nf_flow_table_module_init);
+22 -4
net/netfilter/nf_tables_api.c
···
 }
 
 static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
-			    const struct nlattr *attr, u32 nlmsg_flags)
+			    const struct nlattr *attr, u32 nlmsg_flags,
+			    bool last)
 {
 	struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {};
 	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
···
 	if (flags)
 		*nft_set_ext_flags(ext) = flags;
 
+	if (last)
+		elem.flags = NFT_SET_ELEM_INTERNAL_LAST;
+	else
+		elem.flags = 0;
+
 	if (obj)
 		*nft_set_ext_obj(ext) = obj;
 
···
 			 * and an existing one.
 			 */
 			err = -EEXIST;
+		} else if (err == -ECANCELED) {
+			/* ECANCELED reports an existing nul-element in
+			 * interval sets.
+			 */
+			err = 0;
 		}
 		goto err_element_clash;
 	}
···
 	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
 	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
-		err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags);
+		err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags,
+				       nla_is_last(attr, rem));
 		if (err < 0) {
 			NL_SET_BAD_ATTR(extack, attr);
 			return err;
···
 }
 
 static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
-			   const struct nlattr *attr)
+			   const struct nlattr *attr, bool last)
 {
 	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
 	struct nft_set_ext_tmpl tmpl;
···
 	ext = nft_set_elem_ext(set, elem.priv);
 	if (flags)
 		*nft_set_ext_flags(ext) = flags;
+
+	if (last)
+		elem.flags = NFT_SET_ELEM_INTERNAL_LAST;
+	else
+		elem.flags = 0;
 
 	trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
 	if (trans == NULL)
···
 		return nft_set_flush(&ctx, set, genmask);
 
 	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
-		err = nft_del_setelem(&ctx, set, attr);
+		err = nft_del_setelem(&ctx, set, attr,
+				      nla_is_last(attr, rem));
 		if (err == -ENOENT &&
 		    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM)
 			continue;
+2 -2
net/netfilter/nft_counter.c
···
 	nft_sync = this_cpu_ptr(&nft_counter_sync);
 
 	u64_stats_update_begin(nft_sync);
-	u64_stats_add(&this_cpu->packets, -total->packets);
-	u64_stats_add(&this_cpu->bytes, -total->bytes);
+	u64_stats_sub(&this_cpu->packets, total->packets);
+	u64_stats_sub(&this_cpu->bytes, total->bytes);
 	u64_stats_update_end(nft_sync);
 
 	local_bh_enable();
+7 -2
net/netfilter/nft_set_hash.c
···
 nft_hash_get(const struct net *net, const struct nft_set *set,
 	     const struct nft_set_elem *elem, unsigned int flags)
 {
+	const u32 *key = (const u32 *)&elem->key.val;
 	struct nft_hash *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_cur(net);
 	struct nft_hash_elem *he;
 	u32 hash;
 
-	hash = jhash(elem->key.val.data, set->klen, priv->seed);
+	if (set->klen == 4)
+		hash = jhash_1word(*key, priv->seed);
+	else
+		hash = jhash(key, set->klen, priv->seed);
+
 	hash = reciprocal_scale(hash, priv->buckets);
 	hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
-		if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) &&
+		if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) &&
 		    nft_set_elem_active(&he->ext, genmask))
 			return &he->priv;
 	}
+300 -77
net/netfilter/nft_set_rbtree.c
···
 	rwlock_t		lock;
 	struct nft_array __rcu	*array;
 	struct nft_array	*array_next;
+	unsigned long		start_rbe_cookie;
 	unsigned long		last_gc;
+	struct list_head	expired;
+	u64			last_tstamp;
 };
 
 struct nft_rbtree_elem {
 	struct nft_elem_priv	priv;
-	struct rb_node		node;
+	union {
+		struct rb_node		node;
+		struct list_head	list;
+	};
 	struct nft_set_ext	ext;
 };
 
···
 static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
 {
 	return !nft_rbtree_interval_end(rbe);
+}
+
+static bool nft_rbtree_interval_null(const struct nft_set *set,
+				     const struct nft_rbtree_elem *rbe)
+{
+	return (!memchr_inv(nft_set_ext_key(&rbe->ext), 0, set->klen) &&
+		nft_rbtree_interval_end(rbe));
 }
 
 static int nft_rbtree_cmp(const struct nft_set *set,
···
 	return &rbe->priv;
 }
 
-static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
-				      struct nft_rbtree *priv,
-				      struct nft_rbtree_elem *rbe)
+static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set,
+				    struct nft_rbtree *priv,
+				    struct nft_rbtree_elem *rbe)
 {
 	lockdep_assert_held_write(&priv->lock);
 	nft_setelem_data_deactivate(net, set, &rbe->priv);
 	rb_erase(&rbe->node, &priv->root);
+
+	/* collected later on in commit callback */
+	list_add(&rbe->list, &priv->expired);
 }
 
 static const struct nft_rbtree_elem *
···
 	struct rb_node *prev = rb_prev(&rbe->node);
 	struct net *net = read_pnet(&set->net);
 	struct nft_rbtree_elem *rbe_prev;
-	struct nft_trans_gc *gc;
-
-	gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
-	if (!gc)
-		return ERR_PTR(-ENOMEM);
 
 	/* search for end interval coming before this element.
 	 * end intervals don't carry a timeout extension, they
···
 	rbe_prev = NULL;
 	if (prev) {
 		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
-		nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev);
-
-		/* There is always room in this trans gc for this element,
-		 * memory allocation never actually happens, hence, the warning
-		 * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
-		 * this is synchronous gc which never fails.
-		 */
-		gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
-		if (WARN_ON_ONCE(!gc))
-			return ERR_PTR(-ENOMEM);
-
-		nft_trans_gc_elem_add(gc, rbe_prev);
+		nft_rbtree_gc_elem_move(net, set, priv, rbe_prev);
 	}
 
-	nft_rbtree_gc_elem_remove(net, set, priv, rbe);
-	gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
-	if (WARN_ON_ONCE(!gc))
-		return ERR_PTR(-ENOMEM);
-
-	nft_trans_gc_elem_add(gc, rbe);
-
-	nft_trans_gc_queue_sync_done(gc);
+	nft_rbtree_gc_elem_move(net, set, priv, rbe);
 
 	return rbe_prev;
 }
···
 	return false;
 }
 
+/* Only for anonymous sets which do not allow updates, all elements are active. */
+static struct nft_rbtree_elem *nft_rbtree_prev_active(struct nft_rbtree_elem *rbe)
+{
+	struct rb_node *node;
+
+	node = rb_prev(&rbe->node);
+	if (!node)
+		return NULL;
+
+	return rb_entry(node, struct nft_rbtree_elem, node);
+}
+
+static struct nft_rbtree_elem *
+__nft_rbtree_next_active(struct rb_node *node, u8 genmask)
+{
+	struct nft_rbtree_elem *next_rbe;
+
+	while (node) {
+		next_rbe = rb_entry(node, struct nft_rbtree_elem, node);
+		if (!nft_set_elem_active(&next_rbe->ext, genmask)) {
+			node = rb_next(node);
+			continue;
+		}
+
+		return next_rbe;
+	}
+
+	return NULL;
+}
+
+static struct nft_rbtree_elem *
+nft_rbtree_next_active(struct nft_rbtree_elem *rbe, u8 genmask)
+{
+	return __nft_rbtree_next_active(rb_next(&rbe->node), genmask);
+}
+
+static void nft_rbtree_maybe_reset_start_cookie(struct nft_rbtree *priv,
+						u64 tstamp)
+{
+	if (priv->last_tstamp != tstamp) {
+		priv->start_rbe_cookie = 0;
+		priv->last_tstamp = tstamp;
+	}
+}
+
+static void nft_rbtree_set_start_cookie(struct nft_rbtree *priv,
+					const struct nft_rbtree_elem *rbe)
+{
+	priv->start_rbe_cookie = (unsigned long)rbe;
+}
+
+static void nft_rbtree_set_start_cookie_open(struct nft_rbtree *priv,
+					     const struct nft_rbtree_elem *rbe,
+					     unsigned long open_interval)
+{
+	priv->start_rbe_cookie = (unsigned long)rbe | open_interval;
+}
+
+#define NFT_RBTREE_OPEN_INTERVAL	1UL
+
+static bool nft_rbtree_cmp_start_cookie(struct nft_rbtree *priv,
+					const struct nft_rbtree_elem *rbe)
+{
+	return (priv->start_rbe_cookie & ~NFT_RBTREE_OPEN_INTERVAL) == (unsigned long)rbe;
+}
+
+static bool nft_rbtree_insert_same_interval(const struct net *net,
+					    struct nft_rbtree *priv,
+					    struct nft_rbtree_elem *rbe)
+{
+	u8 genmask = nft_genmask_next(net);
+	struct nft_rbtree_elem *next_rbe;
+
+	if (!priv->start_rbe_cookie)
+		return true;
+
+	next_rbe = nft_rbtree_next_active(rbe, genmask);
+	if (next_rbe) {
+		/* Closest start element differs from last element added. */
+		if (nft_rbtree_interval_start(next_rbe) &&
+		    nft_rbtree_cmp_start_cookie(priv, next_rbe)) {
+			priv->start_rbe_cookie = 0;
+			return true;
+		}
+	}
+
+	priv->start_rbe_cookie = 0;
+
+	return false;
+}
+
 static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			       struct nft_rbtree_elem *new,
-			       struct nft_elem_priv **elem_priv)
+			       struct nft_elem_priv **elem_priv, u64 tstamp, bool last)
 {
-	struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
+	struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL, *rbe_prev;
 	struct rb_node *node, *next, *parent, **p, *first = NULL;
 	struct nft_rbtree *priv = nft_set_priv(set);
 	u8 cur_genmask = nft_genmask_cur(net);
 	u8 genmask = nft_genmask_next(net);
-	u64 tstamp = nft_net_tstamp(net);
+	unsigned long open_interval = 0;
 	int d;
 
 	/* Descend the tree to search for an existing element greater than the
···
 		}
 	}
 
+	if (nft_rbtree_interval_null(set, new)) {
+		priv->start_rbe_cookie = 0;
+	} else if (nft_rbtree_interval_start(new) && priv->start_rbe_cookie) {
+		if (nft_set_is_anonymous(set)) {
+			priv->start_rbe_cookie = 0;
+		} else if (priv->start_rbe_cookie & NFT_RBTREE_OPEN_INTERVAL) {
+			/* Previous element is an open interval that partially
+			 * overlaps with an existing non-open interval.
+			 */
+			return -ENOTEMPTY;
+		}
+	}
+
 	/* - new start element matching existing start element: full overlap
 	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
 	 */
 	if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
 	    nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
 		*elem_priv = &rbe_ge->priv;
+
+		/* - Corner case: new start element of open interval (which
+		 *   comes as last element in the batch) overlaps the start of
+		 *   an existing interval with an end element: partial overlap.
+		 */
+		node = rb_first(&priv->root);
+		rbe = __nft_rbtree_next_active(node, genmask);
+		if (rbe && nft_rbtree_interval_end(rbe)) {
+			rbe = nft_rbtree_next_active(rbe, genmask);
+			if (rbe &&
+			    nft_rbtree_interval_start(rbe) &&
+			    !nft_rbtree_cmp(set, new, rbe)) {
+				if (last)
+					return -ENOTEMPTY;
+
+				/* Maybe open interval? */
+				open_interval = NFT_RBTREE_OPEN_INTERVAL;
+			}
+		}
+		nft_rbtree_set_start_cookie_open(priv, rbe_ge, open_interval);
+
 		return -EEXIST;
 	}
 
···
 	 */
 	if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
 	    nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
+		/* - ignore null interval, otherwise NLM_F_CREATE bogusly
+		 *   reports EEXIST.
+		 */
+		if (nft_rbtree_interval_null(set, new))
+			return -ECANCELED;
+
 		*elem_priv = &rbe_le->priv;
+
+		/* - start and end element belong to the same interval. */
+		if (!nft_rbtree_insert_same_interval(net, priv, rbe_le))
+			return -ENOTEMPTY;
+
 		return -EEXIST;
 	}
 
 	/* - new start element with existing closest, less or equal key value
 	 *   being a start element: partial overlap, reported as -ENOTEMPTY.
 	 *   Anonymous sets allow for two consecutive start element since they
-	 *   are constant, skip them to avoid bogus overlap reports.
+	 *   are constant, but validate that this new start element does not
+	 *   sit in between an existing start and end elements: partial overlap,
+	 *   reported as -ENOTEMPTY.
 	 */
-	if (!nft_set_is_anonymous(set) && rbe_le &&
-	    nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
-		return -ENOTEMPTY;
+	if (rbe_le &&
+	    nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) {
+		if (!nft_set_is_anonymous(set))
+			return -ENOTEMPTY;
+
+		rbe_prev = nft_rbtree_prev_active(rbe_le);
+		if (rbe_prev && nft_rbtree_interval_end(rbe_prev))
+			return -ENOTEMPTY;
+	}
 
 	/* - new end element with existing closest, less or equal key value
 	 *   being a end element: partial overlap, reported as -ENOTEMPTY.
···
 	 */
 	if (rbe_ge &&
 	    nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
+		return -ENOTEMPTY;
+
+	/* - start element overlaps an open interval but end element is new:
+	 *   partial overlap, reported as -ENOTEMPTY.
+	 */
+	if (!rbe_ge && priv->start_rbe_cookie && nft_rbtree_interval_end(new))
 		return -ENOTEMPTY;
 
 	/* Accepted element: pick insertion point depending on key value */
···
 			     struct nft_elem_priv **elem_priv)
 {
 	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
+	bool last = !!(elem->flags & NFT_SET_ELEM_INTERNAL_LAST);
 	struct nft_rbtree *priv = nft_set_priv(set);
+	u64 tstamp = nft_net_tstamp(net);
 	int err;
+
+	nft_rbtree_maybe_reset_start_cookie(priv, tstamp);
 
 	if (nft_array_may_resize(set) < 0)
 		return -ENOMEM;
···
 		cond_resched();
 
 		write_lock_bh(&priv->lock);
-		err = __nft_rbtree_insert(net, set, rbe, elem_priv);
+		err = __nft_rbtree_insert(net, set, rbe, elem_priv, tstamp, last);
 		write_unlock_bh(&priv->lock);
+
+		if (nft_rbtree_interval_end(rbe))
+			priv->start_rbe_cookie = 0;
+
 	} while (err == -EAGAIN);
 
 	return err;
···
 	nft_clear(net, &rbe->ext);
 }
 
+static struct nft_rbtree_elem *
+nft_rbtree_next_inactive(struct nft_rbtree_elem *rbe, u8 genmask)
+{
+	struct nft_rbtree_elem *next_rbe;
+	struct rb_node *node;
+
+	node = rb_next(&rbe->node);
+	if (node) {
+		next_rbe = rb_entry(node, struct nft_rbtree_elem, node);
+		if (nft_rbtree_interval_start(next_rbe) &&
+		    !nft_set_elem_active(&next_rbe->ext, genmask))
+			return next_rbe;
+	}
+
+	return NULL;
+}
+
+static bool nft_rbtree_deactivate_same_interval(const struct net *net,
+						struct nft_rbtree *priv,
+						struct nft_rbtree_elem *rbe)
+{
+	u8 genmask = nft_genmask_next(net);
+	struct nft_rbtree_elem *next_rbe;
+
+	if (!priv->start_rbe_cookie)
+		return true;
+
+	next_rbe = nft_rbtree_next_inactive(rbe, genmask);
+	if (next_rbe) {
+		/* Closest start element differs from last element added. */
+		if (nft_rbtree_interval_start(next_rbe) &&
+		    nft_rbtree_cmp_start_cookie(priv, next_rbe)) {
+			priv->start_rbe_cookie = 0;
+			return true;
+		}
+	}
+
+	priv->start_rbe_cookie = 0;
+
+	return false;
+}
+
 static void nft_rbtree_flush(const struct net *net,
 			     const struct nft_set *set,
 			     struct nft_elem_priv *elem_priv)
···
 			      const struct nft_set_elem *elem)
 {
 	struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv);
-	const struct nft_rbtree *priv = nft_set_priv(set);
+	bool last = !!(elem->flags & NFT_SET_ELEM_INTERNAL_LAST);
+	struct nft_rbtree *priv = nft_set_priv(set);
 	const struct rb_node *parent = priv->root.rb_node;
 	u8 genmask = nft_genmask_next(net);
 	u64 tstamp = nft_net_tstamp(net);
 	int d;
+
+	nft_rbtree_maybe_reset_start_cookie(priv, tstamp);
+
+	if (nft_rbtree_interval_start(this) ||
+	    nft_rbtree_interval_null(set, this))
+		priv->start_rbe_cookie = 0;
 
 	if (nft_array_may_resize(set) < 0)
 		return NULL;
···
 			parent = parent->rb_left;
 			continue;
 		}
+
+		if (nft_rbtree_interval_start(rbe)) {
+			if (!last)
+				nft_rbtree_set_start_cookie(priv, rbe);
+		} else if (!nft_rbtree_deactivate_same_interval(net, priv, rbe))
+			return NULL;
+
 		nft_rbtree_flush(net, set, &rbe->priv);
 		return &rbe->priv;
 	}
···
 	}
 }
 
-static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
-				 struct nft_rbtree *priv,
-				 struct nft_rbtree_elem *rbe)
-{
-	nft_setelem_data_deactivate(net, set, &rbe->priv);
-	nft_rbtree_erase(priv, rbe);
-}
-
-static void nft_rbtree_gc(struct nft_set *set)
+static void nft_rbtree_gc_scan(struct nft_set *set)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
 	struct nft_rbtree_elem *rbe, *rbe_end = NULL;
 	struct net *net = read_pnet(&set->net);
 	u64 tstamp = nft_net_tstamp(net);
 	struct rb_node *node, *next;
-	struct nft_trans_gc *gc;
-
-	set = nft_set_container_of(priv);
-	net = read_pnet(&set->net);
-
-	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
-	if (!gc)
-		return;
 
 	for (node = rb_first(&priv->root); node ; node = next) {
 		next = rb_next(node);
···
 		if (!__nft_set_elem_expired(&rbe->ext, tstamp))
 			continue;
 
-		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
-		if (!gc)
-			goto try_later;
-
 		/* end element needs to be removed first, it has
 		 * no timeout extension.
 		 */
+		write_lock_bh(&priv->lock);
 		if (rbe_end) {
-			nft_rbtree_gc_remove(net, set, priv, rbe_end);
-			nft_trans_gc_elem_add(gc, rbe_end);
+			nft_rbtree_gc_elem_move(net, set, priv, rbe_end);
 			rbe_end = NULL;
 		}
 
+		nft_rbtree_gc_elem_move(net, set, priv, rbe);
+		write_unlock_bh(&priv->lock);
+	}
+
+	priv->last_gc = jiffies;
+}
+
+static void nft_rbtree_gc_queue(struct nft_set *set)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+	struct nft_rbtree_elem *rbe, *rbe_end;
+	struct nft_trans_gc *gc;
+
+	if (list_empty(&priv->expired))
+		return;
+
+	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+	if (!gc)
+		return;
+
+	list_for_each_entry_safe(rbe, rbe_end, &priv->expired, list) {
+		list_del(&rbe->list);
+		nft_trans_gc_elem_add(gc, rbe);
+
 		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
 		if (!gc)
-			goto try_later;
-
-		nft_rbtree_gc_remove(net, set, priv, rbe);
-		nft_trans_gc_elem_add(gc, rbe);
+			return;
 	}
 
-try_later:
-
-	if (gc) {
-		gc = nft_trans_gc_catchall_sync(gc);
-		nft_trans_gc_queue_sync_done(gc);
-		priv->last_gc = jiffies;
-	}
+	gc = nft_trans_gc_catchall_sync(gc);
+	nft_trans_gc_queue_sync_done(gc);
 }
 
 static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
···
 
 	rwlock_init(&priv->lock);
 	priv->root = RB_ROOT;
+	INIT_LIST_HEAD(&priv->expired);
 
 	priv->array = NULL;
 	priv->array_next = NULL;
···
 			      const struct nft_set *set)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
-	struct nft_rbtree_elem *rbe;
+	struct nft_rbtree_elem *rbe, *next;
 	struct nft_array *array;
 	struct rb_node *node;
+
+	list_for_each_entry_safe(rbe, next, &priv->expired, list) {
+		list_del(&rbe->list);
+		nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
+	}
 
 	while ((node = priv->root.rb_node) != NULL) {
 		rb_erase(node, &priv->root);
···
 	u32 num_intervals = 0;
 	struct rb_node *node;
 
-	if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
-		nft_rbtree_gc(set);
-
 	/* No changes, skip, eg. elements updates only. */
 	if (!priv->array_next)
 		return;
+
+	/* GC can be performed if the binary search blob is going
+	 * to be rebuilt. It has to be done in two phases: first
+	 * scan the tree and move all expired elements to the expired
+	 * list.
+	 *
+	 * Then, after the blob has been re-built and published to other
+	 * CPUs, queue collected entries for freeing.
+	 */
+	if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+		nft_rbtree_gc_scan(set);
 
 	/* Reverse walk to create an array from smaller to largest interval. */
 	node = rb_last(&priv->root);
···
 	num_intervals++;
 err_out:
 	priv->array_next->num_intervals = num_intervals;
-	old = rcu_replace_pointer(priv->array, priv->array_next, true);
+	old = rcu_replace_pointer(priv->array, priv->array_next,
+				  lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex));
 	priv->array_next = NULL;
 	if (old)
 		call_rcu(&old->rcu_head, nft_array_free_rcu);
+
+	/* New blob is public, queue collected entries for freeing.
+	 * call_rcu ensures elements stay around until readers are done.
+	 */
+	nft_rbtree_gc_queue(set);
 }
 
 static void nft_rbtree_abort(const struct nft_set *set)
+1
tools/testing/selftests/net/netfilter/config
···
 CONFIG_IP_SCTP=m
 CONFIG_IPV6=y
 CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_TUNNEL=m
 CONFIG_IP_VS=m
 CONFIG_IP_VS_PROTO_TCP=y
 CONFIG_IP_VS_RR=m
+13 -6
tools/testing/selftests/net/netfilter/nft_flowtable.sh
···
 ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0
 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
 
-ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1
+ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1 || ret=1
 ip -net "$nsr2" link set tun6 up
 ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad
 
 ip -net "$nsr1" route change default via 192.168.100.2
 ip -net "$nsr2" route change default via 192.168.100.1
-ip -6 -net "$nsr1" route change default via fee1:3::2
-ip -6 -net "$nsr2" route change default via fee1:3::1
+
+# do not use "route change" and delete old default so
+# socat fails to connect in case new default can't be added.
+ip -6 -net "$nsr1" route delete default
+ip -6 -net "$nsr1" route add default via fee1:3::2
+ip -6 -net "$nsr2" route delete default
+ip -6 -net "$nsr2" route add default via fee1:3::1
 ip -net "$ns2" route add default via 10.0.2.1
 ip -6 -net "$ns2" route add default via dead:2::1
 
···
 ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2
 ip -net "$nsr1" link set tun6.10 up
 ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad
-ip -6 -net "$nsr1" route change default via fee1:5::2
+ip -6 -net "$nsr1" route delete default
+ip -6 -net "$nsr1" route add default via fee1:5::2
 ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6.10 accept'
 
 ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10
···
 ip -net "$nsr2" route change default via 192.168.200.1
 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
 
-ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1
+ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 || ret=1
 ip -net "$nsr2" link set tun6.10 up
 ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad
-ip -6 -net "$nsr2" route change default via fee1:5::1
+ip -6 -net "$nsr2" route delete default
+ip -6 -net "$nsr2" route add default via fee1:5::1
 
 if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then
 	echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2
+136 -6
tools/testing/selftests/net/netfilter/nft_queue.sh
···
 
 udp_listener_ready()
 {
-	ss -S -N "$1" -uln -o "sport = :12345" | grep -q 12345
+	ss -S -N "$1" -uln -o "sport = :$2" | grep -q "$2"
 }
 
 output_files_written()
···
 	test -s "$1" && test -s "$2"
 }
 
-test_udp_ct_race()
+test_udp_nat_race()
 {
 	ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
 flush ruleset
···
 	ip netns exec "$nsrouter" ./nf_queue -q 12 -d 1000 &
 	local nfqpid=$!
 
-	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2"
-	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3"
+	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 12345
+	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3" 12345
 	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 12
 
 	# Send two packets, one should end up in ns1, other in ns2.
···
 
 	busywait 10000 output_files_written "$TMPFILE1" "$TMPFILE2"
 
-	kill "$nfqpid"
+	kill "$nfqpid" "$rpid1" "$rpid2"
 
 	if ! ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12345 2>/dev/null | wc -l | grep -q "^1"'; then
 		echo "FAIL: Expected One udp conntrack entry"
···
 	fi
 
 	echo "PASS: both udp receivers got one packet each"
+}
+
+# Make sure UDPGRO aggregated packets don't lose
+# their skb->nfct entry when nfqueue passes the
+# skb to userspace with software gso segmentation on.
+test_udp_gro_ct()
+{
+	local errprefix="FAIL: test_udp_gro_ct:"
+
+	ip netns exec "$nsrouter" conntrack -F 2>/dev/null
+
+	ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet udpq {
+	# Number of packets/bytes queued to userspace
+	counter toqueue { }
+	# Number of packets/bytes reinjected from userspace with 'ct new' intact
+	counter fromqueue { }
+	# These two counters should be identical and not 0.
+
+	chain prerouting {
+		type filter hook prerouting priority -300; policy accept;
+
+		# userspace sends small packets, if < 1000, UDPGRO did
+		# not kick in, but test needs a 'new' conntrack with udpgro skb.
+		meta iifname veth0 meta l4proto udp meta length > 1000 accept
+
+		# don't pick up non-gso packets and don't queue them to
+		# userspace.
+		notrack
+	}
+
+	chain postrouting {
+		type filter hook postrouting priority 0; policy accept;
+
+		# Only queue unconfirmed fraglist gro skbs to userspace.
+		udp dport 12346 ct status ! confirmed counter name "toqueue" mark set 1 queue num 1
+	}
+
+	chain validate {
+		type filter hook postrouting priority 1; policy accept;
+		# ... and only count those that were reinjected with the
+		# skb->nfct intact.
+		mark 1 counter name "fromqueue"
+	}
+}
+EOF
+	timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12346,fork,pf=ipv4 OPEN:"$TMPFILE1",trunc &
+	local rpid=$!
+
+	ip netns exec "$nsrouter" ./nf_queue -G -c -q 1 -t 2 > "$TMPFILE2" &
+	local nfqpid=$!
+
+	ip netns exec "$nsrouter" ethtool -K "veth0" rx-udp-gro-forwarding on rx-gro-list on generic-receive-offload on
+
+	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 12346
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 1
+
+	local bs=512
+	local count=$(((32 * 1024 * 1024) / bs))
+	dd if=/dev/zero bs="$bs" count="$count" 2>/dev/null | for i in $(seq 1 16); do
+		timeout 5 ip netns exec "$ns1" \
+			socat -u -b 512 STDIN UDP-DATAGRAM:10.0.2.99:12346,reuseport,bind=0.0.0.0:55221 &
+	done
+
+	busywait 10000 test -s "$TMPFILE1"
+
+	kill "$rpid"
+
+	wait
+
+	local p
+	local b
+	local pqueued
+	local bqueued
+
+	c=$(ip netns exec "$nsrouter" nft list counter inet udpq "toqueue" | grep packets)
+	read p pqueued b bqueued <<EOF
+$c
+EOF
+	local preinject
+	local breinject
+	c=$(ip netns exec "$nsrouter" nft list counter inet udpq "fromqueue" | grep packets)
+	read p preinject b breinject <<EOF
+$c
+EOF
+	ip netns exec "$nsrouter" ethtool -K "veth0" rx-udp-gro-forwarding off
+	ip netns exec "$nsrouter" ethtool -K "veth1" rx-udp-gro-forwarding off
+
+	if [ "$pqueued" -eq 0 ];then
+		# happens when gro did not build at least one aggregate
+		echo "SKIP: No packets were queued"
+		return
+	fi
+
+	local saw_ct_entry=0
+	if ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12346 2>/dev/null | wc -l | grep -q "^1"'; then
+		saw_ct_entry=1
+	else
+		echo "$errprefix Expected udp conntrack entry"
+		ip netns exec "$nsrouter" conntrack -L
+		ret=1
+	fi
+
+	if [ "$pqueued" -ge "$preinject" ] ;then
+		echo "$errprefix Expected software segmentation to occur, had $pqueued and $preinject"
+		ret=1
+		return
+	fi
+
+	# sw segmentation adds extra udp and ip headers.
+	local breinject_expect=$((preinject * (512 + 20 + 8)))
+
+	if [ "$breinject" -eq "$breinject_expect" ]; then
+		if [ "$saw_ct_entry" -eq 1 ];then
+			echo "PASS: fraglist gro skb passed with conntrack entry"
+		else
+			echo "$errprefix fraglist gro skb passed without conntrack entry"
+			ret=1
+		fi
+	else
+		echo "$errprefix Counter mismatch, conntrack entry dropped by nfqueue? Queued: $pqueued, $bqueued. Post-queue: $preinject, $breinject. Expected $breinject_expect"
+		ret=1
+	fi
+
+	if ! ip netns exec "$nsrouter" nft delete table inet udpq; then
+		echo "$errprefix Could not delete udpq table"
+		ret=1
+	fi
 }
 
 test_queue_removal()
···
 test_tcp_localhost_requeue
 test_sctp_forward
 test_sctp_output
-test_udp_ct_race
+test_udp_nat_race
+test_udp_gro_ct
 
 # should be last, adds vrf device in ns1 and changes routes
 test_icmp_vrf