Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

net: sched: refine software bypass handling in tc_run

This patch addresses issues with filter counting in block (tcf_block),
particularly for software bypass scenarios, by introducing a more
accurate mechanism using useswcnt.

Previously, filtercnt and skipswcnt were introduced by:

Commit 2081fd3445fe ("net: sched: cls_api: add filter counter") and
Commit f631ef39d819 ("net: sched: cls_api: add skip_sw counter")

filtercnt tracked all tp (tcf_proto) objects added to a block, and
skipswcnt counted tp objects with the skipsw attribute set.

The problem is that a single tp can contain multiple filters, some with
skipsw and others without. The current implementation fails in the
following case:

When the first filter in a tp has skipsw, both skipswcnt and filtercnt
are incremented, then adding a second filter without skipsw to the same
tp does not modify these counters because tp->counted is already set.

This results in the software bypass decision being based solely on
skipswcnt equaling filtercnt, even when the block includes filters without
skipsw. Consequently, filters without skipsw are inadvertently bypassed.

To address this, the patch introduces useswcnt in block to explicitly count
tp objects containing at least one filter without skipsw. Key changes
include:

Whenever a filter without skipsw is added, its tp is marked with usesw
and counted in useswcnt. tc_run() now uses useswcnt to determine software
bypass, eliminating reliance on filtercnt and skipswcnt.

This refined approach prevents software bypass for blocks containing
mixed filters, ensuring correct behavior in tc_run().

Additionally, because atomic operations on useswcnt ensure thread safety
and tp->lock guards access to tp->usesw and tp->counted, the broader lock
down_write(&block->cb_lock) is no longer required in tc_new_tfilter().
This resolves a performance regression caused by the filter counting
mechanism during parallel filter insertions.

The improvement can be demonstrated using the following script:

# cat insert_tc_rules.sh

tc qdisc add dev ens1f0np0 ingress
for i in $(seq 16); do
taskset -c $i tc -b rules_$i.txt &
done
wait

Each of the rules_$i.txt files above contains 100000 tc filter rules for
an mlx5 driver NIC, ens1f0np0.

Without this patch:

# time sh insert_tc_rules.sh

real 0m50.780s
user 0m23.556s
sys 4m13.032s

With this patch:

# time sh insert_tc_rules.sh

real 0m17.718s
user 0m7.807s
sys 3m45.050s

Fixes: 047f340b36fc ("net: sched: make skip_sw actually skip software")
Reported-by: Shuang Li <shuali@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Reviewed-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Tested-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Xin Long and committed by
David S. Miller
a12c76a0 59372af6

+55 -45
+11 -2
include/net/pkt_cls.h
··· 75 75 } 76 76 77 77 #ifdef CONFIG_NET_CLS_ACT 78 - DECLARE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); 78 + DECLARE_STATIC_KEY_FALSE(tcf_sw_enabled_key); 79 79 80 80 static inline bool tcf_block_bypass_sw(struct tcf_block *block) 81 81 { 82 - return block && block->bypass_wanted; 82 + return block && !atomic_read(&block->useswcnt); 83 83 } 84 84 #endif 85 85 ··· 758 758 cls_common->skip_sw = tc_skip_sw(flags); 759 759 if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) 760 760 cls_common->extack = extack; 761 + } 762 + 763 + static inline void tcf_proto_update_usesw(struct tcf_proto *tp, u32 flags) 764 + { 765 + if (tp->usesw) 766 + return; 767 + if (tc_skip_sw(flags) && tc_in_hw(flags)) 768 + return; 769 + tp->usesw = true; 761 770 } 762 771 763 772 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+2 -3
include/net/sch_generic.h
··· 425 425 spinlock_t lock; 426 426 bool deleting; 427 427 bool counted; 428 + bool usesw; 428 429 refcount_t refcnt; 429 430 struct rcu_head rcu; 430 431 struct hlist_node destroy_ht_node; ··· 475 474 struct flow_block flow_block; 476 475 struct list_head owner_list; 477 476 bool keep_dst; 478 - bool bypass_wanted; 479 - atomic_t filtercnt; /* Number of filters */ 480 - atomic_t skipswcnt; /* Number of skip_sw filters */ 477 + atomic_t useswcnt; 481 478 atomic_t offloadcnt; /* Number of oddloaded filters */ 482 479 unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ 483 480 unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */
+9 -6
net/core/dev.c
··· 2248 2248 #endif 2249 2249 2250 2250 #ifdef CONFIG_NET_CLS_ACT 2251 - DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); 2252 - EXPORT_SYMBOL(tcf_bypass_check_needed_key); 2251 + DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); 2252 + EXPORT_SYMBOL(tcf_sw_enabled_key); 2253 2253 #endif 2254 2254 2255 2255 DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); ··· 4144 4144 if (!miniq) 4145 4145 return ret; 4146 4146 4147 - if (static_branch_unlikely(&tcf_bypass_check_needed_key)) { 4148 - if (tcf_block_bypass_sw(miniq->block)) 4149 - return ret; 4150 - } 4147 + /* Global bypass */ 4148 + if (!static_branch_likely(&tcf_sw_enabled_key)) 4149 + return ret; 4150 + 4151 + /* Block-wise bypass */ 4152 + if (tcf_block_bypass_sw(miniq->block)) 4153 + return ret; 4151 4154 4152 4155 tc_skb_cb(skb)->mru = 0; 4153 4156 tc_skb_cb(skb)->post_ct = false;
+23 -34
net/sched/cls_api.c
··· 390 390 tp->protocol = protocol; 391 391 tp->prio = prio; 392 392 tp->chain = chain; 393 + tp->usesw = !tp->ops->reoffload; 393 394 spin_lock_init(&tp->lock); 394 395 refcount_set(&tp->refcnt, 1); 395 396 ··· 411 410 refcount_inc(&tp->refcnt); 412 411 } 413 412 414 - static void tcf_maintain_bypass(struct tcf_block *block) 413 + static void tcf_proto_count_usesw(struct tcf_proto *tp, bool add) 415 414 { 416 - int filtercnt = atomic_read(&block->filtercnt); 417 - int skipswcnt = atomic_read(&block->skipswcnt); 418 - bool bypass_wanted = filtercnt > 0 && filtercnt == skipswcnt; 419 - 420 - if (bypass_wanted != block->bypass_wanted) { 421 415 #ifdef CONFIG_NET_CLS_ACT 422 - if (bypass_wanted) 423 - static_branch_inc(&tcf_bypass_check_needed_key); 424 - else 425 - static_branch_dec(&tcf_bypass_check_needed_key); 426 - #endif 427 - block->bypass_wanted = bypass_wanted; 428 - } 429 - } 416 + struct tcf_block *block = tp->chain->block; 417 + bool counted = false; 430 418 431 - static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add) 432 - { 433 - lockdep_assert_not_held(&block->cb_lock); 434 - 435 - down_write(&block->cb_lock); 436 - if (*counted != add) { 437 - if (add) { 438 - atomic_inc(&block->filtercnt); 439 - *counted = true; 440 - } else { 441 - atomic_dec(&block->filtercnt); 442 - *counted = false; 419 + if (!add) { 420 + if (tp->usesw && tp->counted) { 421 + if (!atomic_dec_return(&block->useswcnt)) 422 + static_branch_dec(&tcf_sw_enabled_key); 423 + tp->counted = false; 443 424 } 425 + return; 444 426 } 445 - tcf_maintain_bypass(block); 446 - up_write(&block->cb_lock); 427 + 428 + spin_lock(&tp->lock); 429 + if (tp->usesw && !tp->counted) { 430 + counted = true; 431 + tp->counted = true; 432 + } 433 + spin_unlock(&tp->lock); 434 + 435 + if (counted && atomic_inc_return(&block->useswcnt) == 1) 436 + static_branch_inc(&tcf_sw_enabled_key); 437 + #endif 447 438 } 448 439 449 440 static void tcf_chain_put(struct tcf_chain 
*chain); ··· 444 451 bool sig_destroy, struct netlink_ext_ack *extack) 445 452 { 446 453 tp->ops->destroy(tp, rtnl_held, extack); 447 - tcf_block_filter_cnt_update(tp->chain->block, &tp->counted, false); 454 + tcf_proto_count_usesw(tp, false); 448 455 if (sig_destroy) 449 456 tcf_proto_signal_destroyed(tp->chain, tp); 450 457 tcf_chain_put(tp->chain); ··· 2402 2409 tfilter_notify(net, skb, n, tp, block, q, parent, fh, 2403 2410 RTM_NEWTFILTER, false, rtnl_held, extack); 2404 2411 tfilter_put(tp, fh); 2405 - tcf_block_filter_cnt_update(block, &tp->counted, true); 2412 + tcf_proto_count_usesw(tp, true); 2406 2413 /* q pointer is NULL for shared blocks */ 2407 2414 if (q) 2408 2415 q->flags &= ~TCQ_F_CAN_BYPASS; ··· 3525 3532 if (*flags & TCA_CLS_FLAGS_IN_HW) 3526 3533 return; 3527 3534 *flags |= TCA_CLS_FLAGS_IN_HW; 3528 - if (tc_skip_sw(*flags)) 3529 - atomic_inc(&block->skipswcnt); 3530 3535 atomic_inc(&block->offloadcnt); 3531 3536 } 3532 3537 ··· 3533 3542 if (!(*flags & TCA_CLS_FLAGS_IN_HW)) 3534 3543 return; 3535 3544 *flags &= ~TCA_CLS_FLAGS_IN_HW; 3536 - if (tc_skip_sw(*flags)) 3537 - atomic_dec(&block->skipswcnt); 3538 3545 atomic_dec(&block->offloadcnt); 3539 3546 } 3540 3547
+2
net/sched/cls_bpf.c
··· 509 509 if (!tc_in_hw(prog->gen_flags)) 510 510 prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; 511 511 512 + tcf_proto_update_usesw(tp, prog->gen_flags); 513 + 512 514 if (oldprog) { 513 515 idr_replace(&head->handle_idr, prog, handle); 514 516 list_replace_rcu(&oldprog->link, &prog->link);
+2
net/sched/cls_flower.c
··· 2503 2503 if (!tc_in_hw(fnew->flags)) 2504 2504 fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW; 2505 2505 2506 + tcf_proto_update_usesw(tp, fnew->flags); 2507 + 2506 2508 spin_lock(&tp->lock); 2507 2509 2508 2510 /* tp was deleted concurrently. -EAGAIN will cause caller to lookup
+2
net/sched/cls_matchall.c
··· 228 228 if (!tc_in_hw(new->flags)) 229 229 new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; 230 230 231 + tcf_proto_update_usesw(tp, new->flags); 232 + 231 233 *arg = head; 232 234 rcu_assign_pointer(tp->root, new); 233 235 return 0;
+4
net/sched/cls_u32.c
··· 951 951 if (!tc_in_hw(new->flags)) 952 952 new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; 953 953 954 + tcf_proto_update_usesw(tp, new->flags); 955 + 954 956 u32_replace_knode(tp, tp_c, new); 955 957 tcf_unbind_filter(tp, &n->res); 956 958 tcf_exts_get_net(&n->exts); ··· 1165 1163 1166 1164 if (!tc_in_hw(n->flags)) 1167 1165 n->flags |= TCA_CLS_FLAGS_NOT_IN_HW; 1166 + 1167 + tcf_proto_update_usesw(tp, n->flags); 1168 1168 1169 1169 ins = &ht->ht[TC_U32_HASH(handle)]; 1170 1170 for (pins = rtnl_dereference(*ins); pins;