Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'veth-qdisc-backpressure-and-qdisc-check-refactor'

Jesper Dangaard Brouer says:

====================
veth: qdisc backpressure and qdisc check refactor

This patch series addresses TX drops seen on veth devices under load,
particularly when using threaded NAPI, which is our setup in production.

The root cause is that the NAPI consumer often runs on a different CPU
than the producer. Combined with scheduling delays or simply slower
consumption, this increases the chance that the ptr_ring fills up before
packets are drained, resulting in drops from veth_xmit() (ndo_start_xmit()).

To make this easier to reproduce, we’ve created a script that sets up a
test scenario using network namespaces. The script inserts 1000 iptables
rules in the consumer namespace to slow down packet processing and
amplify the issue. Reproducer script:

https://github.com/xdp-project/xdp-project/blob/main/areas/core/veth_setup01_NAPI_TX_drops.sh

This series first introduces a helper to detect no-queue qdiscs and then
uses it in the veth driver to conditionally apply qdisc-level
backpressure when a real qdisc is attached. The behavior is off by
default and opt-in, ensuring minimal impact and easy activation.

v6: https://lore.kernel.org/174549933665.608169.392044991754158047.stgit@firesoul
v5: https://lore.kernel.org/174489803410.355490.13216831426556849084.stgit@firesoul
v4: https://lore.kernel.org/174472463778.274639.12670590457453196991.stgit@firesoul
v3: https://lore.kernel.org/174464549885.20396.6987653753122223942.stgit@firesoul
v2: https://lore.kernel.org/174412623473.3702169.4235683143719614624.stgit@firesoul
RFC-v1: https://lore.kernel.org/174377814192.3376479.16481605648460889310.stgit@firesoul
====================

Link: https://patch.msgid.link/174559288731.827981.8748257839971869213.stgit@firesoul
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+56 -13
+47 -10
drivers/net/veth.c
··· 307 307 308 308 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 309 309 { 310 - if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 311 - dev_kfree_skb_any(skb); 312 - return NET_RX_DROP; 313 - } 310 + if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) 311 + return NETDEV_TX_BUSY; /* signal qdisc layer */ 314 312 315 - return NET_RX_SUCCESS; 313 + return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */ 316 314 } 317 315 318 316 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, ··· 344 346 { 345 347 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 346 348 struct veth_rq *rq = NULL; 347 - int ret = NETDEV_TX_OK; 349 + struct netdev_queue *txq; 348 350 struct net_device *rcv; 349 351 int length = skb->len; 350 352 bool use_napi = false; 351 - int rxq; 353 + int ret, rxq; 352 354 353 355 rcu_read_lock(); 354 356 rcv = rcu_dereference(priv->peer); ··· 371 373 } 372 374 373 375 skb_tx_timestamp(skb); 374 - if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { 376 + 377 + ret = veth_forward_skb(rcv, skb, rq, use_napi); 378 + switch (ret) { 379 + case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */ 375 380 if (!use_napi) 376 381 dev_sw_netstats_tx_add(dev, 1, length); 377 382 else 378 383 __veth_xdp_flush(rq); 379 - } else { 384 + break; 385 + case NETDEV_TX_BUSY: 386 + /* If a qdisc is attached to our virtual device, returning 387 + * NETDEV_TX_BUSY is allowed. 388 + */ 389 + txq = netdev_get_tx_queue(dev, rxq); 390 + 391 + if (qdisc_txq_has_no_queue(txq)) { 392 + dev_kfree_skb_any(skb); 393 + goto drop; 394 + } 395 + /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */ 396 + __skb_push(skb, ETH_HLEN); 397 + /* Depend on prior success packets started NAPI consumer via 398 + * __veth_xdp_flush(). Cancel TXQ stop if consumer stopped, 399 + * paired with empty check in veth_poll(). 
400 + */ 401 + netif_tx_stop_queue(txq); 402 + smp_mb__after_atomic(); 403 + if (unlikely(__ptr_ring_empty(&rq->xdp_ring))) 404 + netif_tx_wake_queue(txq); 405 + break; 406 + case NET_RX_DROP: /* same as NET_XMIT_DROP */ 380 407 drop: 381 408 atomic64_inc(&priv->dropped); 382 409 ret = NET_XMIT_DROP; 410 + break; 411 + default: 412 + net_crit_ratelimited("%s(%s): Invalid return code(%d)", 413 + __func__, dev->name, ret); 383 414 } 384 - 385 415 rcu_read_unlock(); 386 416 387 417 return ret; ··· 900 874 struct veth_xdp_tx_bq *bq, 901 875 struct veth_stats *stats) 902 876 { 877 + struct veth_priv *priv = netdev_priv(rq->dev); 878 + int queue_idx = rq->xdp_rxq.queue_index; 879 + struct netdev_queue *peer_txq; 880 + struct net_device *peer_dev; 903 881 int i, done = 0, n_xdpf = 0; 904 882 void *xdpf[VETH_XDP_BATCH]; 883 + 884 + /* NAPI functions as RCU section */ 885 + peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); 886 + peer_txq = netdev_get_tx_queue(peer_dev, queue_idx); 905 887 906 888 for (i = 0; i < budget; i++) { 907 889 void *ptr = __ptr_ring_consume(&rq->xdp_ring); ··· 958 924 rq->stats.vs.rx_drops += stats->rx_drops; 959 925 rq->stats.vs.xdp_packets += done; 960 926 u64_stats_update_end(&rq->stats.syncp); 927 + 928 + if (unlikely(netif_tx_queue_stopped(peer_txq))) 929 + netif_tx_wake_queue(peer_txq); 961 930 962 931 return done; 963 932 }
+1 -3
drivers/net/vrf.c
··· 343 343 static bool qdisc_tx_is_default(const struct net_device *dev) 344 344 { 345 345 struct netdev_queue *txq; 346 - struct Qdisc *qdisc; 347 346 348 347 if (dev->num_tx_queues > 1) 349 348 return false; 350 349 351 350 txq = netdev_get_tx_queue(dev, 0); 352 - qdisc = rcu_access_pointer(txq->qdisc); 353 351 354 - return !qdisc->enqueue; 352 + return qdisc_txq_has_no_queue(txq); 355 353 } 356 354 357 355 /* Local traffic destined to local address. Reinsert the packet to rx
+8
include/net/sch_generic.h
··· 803 803 return false; 804 804 } 805 805 806 + /* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */ 807 + static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq) 808 + { 809 + struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc); 810 + 811 + return qdisc->enqueue == NULL; 812 + } 813 + 806 814 /* Is the device using the noop qdisc on all queues? */ 807 815 static inline bool qdisc_tx_is_noop(const struct net_device *dev) 808 816 {