Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'net-deal-with-sticky-tx-queues'

Eric Dumazet says:

====================
net: deal with sticky tx queues

Back in 2010, Tom Herbert added skb->ooo_okay to TCP flows.

Extend the feature to connected flows for other protocols like UDP.

skb->ooo_okay might never be set for bulk flows that always
have at least one skb in a qdisc queue or NIC queue,
especially if TX completion is delayed because of a stressed cpu
or aggressive interrupt mitigation.

The so-called "strange attractors" have caused many performance
issues; we need to do better now that TCP reacts better to
potential reorders.

Add new net.core.txq_reselection_ms sysctl to let
flows follow XPS and select a more efficient queue.

After this series, we no longer have to make sure threads
are pinned to cpus; they can migrate without adding
too much [spinlock, qdisc, TX completion] pressure.
====================

Link: https://patch.msgid.link/20251013152234.842065-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+81 -22
+17
Documentation/admin-guide/sysctl/net.rst
··· 406 406 If set to 1 (default), hash rethink is performed on listening socket. 407 407 If set to 0, hash rethink is not performed. 408 408 409 + txq_reselection_ms 410 + ------------------ 411 + 412 + Controls how often (in ms) a busy connected flow can select another tx queue. 413 + 414 + A reselection is desirable when/if user thread has migrated and XPS 415 + would select a different queue. Same can occur without XPS 416 + if the flow hash has changed. 417 + 418 + But switching txq can introduce reorders, especially if the 419 + old queue is under high pressure. Modern TCP stacks deal 420 + well with reorders if they do not happen too often. 421 + 422 + To disable this feature, set the value to 0. 423 + 424 + Default : 1000 425 + 409 426 gro_normal_batch 410 427 ---------------- 411 428
+1
include/net/netns/core.h
··· 13 13 struct ctl_table_header *sysctl_hdr; 14 14 15 15 int sysctl_somaxconn; 16 + int sysctl_txq_reselection; 16 17 int sysctl_optmem_max; 17 18 u8 sysctl_txrehash; 18 19 u8 sysctl_tstamp_allow_data;
+14 -15
include/net/sock.h
··· 313 313 * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock 314 314 * for timestamping 315 315 * @sk_tskey: counter to disambiguate concurrent tstamp requests 316 + * @sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh. 316 317 * @sk_zckey: counter to order MSG_ZEROCOPY notifications 317 318 * @sk_socket: Identd and reporting IO signals 318 319 * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. ··· 486 485 unsigned long sk_pacing_rate; /* bytes per second */ 487 486 atomic_t sk_zckey; 488 487 atomic_t sk_tskey; 488 + unsigned long sk_tx_queue_mapping_jiffies; 489 489 __cacheline_group_end(sock_write_tx); 490 490 491 491 __cacheline_group_begin(sock_read_tx); ··· 1994 1992 /* Paired with READ_ONCE() in sk_tx_queue_get() and 1995 1993 * other WRITE_ONCE() because socket lock might be not held. 1996 1994 */ 1997 - WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); 1995 + if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) { 1996 + WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); 1997 + WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); 1998 + return; 1999 + } 2000 + 2001 + /* Refresh sk_tx_queue_mapping_jiffies if too old. */ 2002 + if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ)) 2003 + WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); 1998 2004 } 1999 2005 2000 2006 #define NO_QUEUE_MAPPING USHRT_MAX ··· 2015 2005 WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING); 2016 2006 } 2017 2007 2018 - static inline int sk_tx_queue_get(const struct sock *sk) 2019 - { 2020 - if (sk) { 2021 - /* Paired with WRITE_ONCE() in sk_tx_queue_clear() 2022 - * and sk_tx_queue_set(). 
2023 - */ 2024 - int val = READ_ONCE(sk->sk_tx_queue_mapping); 2025 - 2026 - if (val != NO_QUEUE_MAPPING) 2027 - return val; 2028 - } 2029 - return -1; 2030 - } 2008 + int sk_tx_queue_get(const struct sock *sk); 2031 2009 2032 2010 static inline void __sk_rx_queue_set(struct sock *sk, 2033 2011 const struct sk_buff *skb, ··· 2301 2303 return 0; 2302 2304 } 2303 2305 2306 + #define SK_WMEM_ALLOC_BIAS 1 2304 2307 /** 2305 2308 * sk_wmem_alloc_get - returns write allocations 2306 2309 * @sk: socket ··· 2310 2311 */ 2311 2312 static inline int sk_wmem_alloc_get(const struct sock *sk) 2312 2313 { 2313 - return refcount_read(&sk->sk_wmem_alloc) - 1; 2314 + return refcount_read(&sk->sk_wmem_alloc) - SK_WMEM_ALLOC_BIAS; 2314 2315 } 2315 2316 2316 2317 /**
+1 -1
net/atm/common.c
··· 157 157 memset(&vcc->local, 0, sizeof(struct sockaddr_atmsvc)); 158 158 memset(&vcc->remote, 0, sizeof(struct sockaddr_atmsvc)); 159 159 vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */ 160 - refcount_set(&sk->sk_wmem_alloc, 1); 160 + refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 161 161 atomic_set(&sk->sk_rmem_alloc, 0); 162 162 vcc->push = NULL; 163 163 vcc->pop = NULL;
+27 -2
net/core/dev.c
··· 4591 4591 } 4592 4592 EXPORT_SYMBOL(dev_pick_tx_zero); 4593 4593 4594 + int sk_tx_queue_get(const struct sock *sk) 4595 + { 4596 + int resel, val; 4597 + 4598 + if (!sk) 4599 + return -1; 4600 + /* Paired with WRITE_ONCE() in sk_tx_queue_clear() 4601 + * and sk_tx_queue_set(). 4602 + */ 4603 + val = READ_ONCE(sk->sk_tx_queue_mapping); 4604 + 4605 + if (val == NO_QUEUE_MAPPING) 4606 + return -1; 4607 + 4608 + if (!sk_fullsock(sk)) 4609 + return val; 4610 + 4611 + resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection); 4612 + if (resel && time_is_before_jiffies( 4613 + READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel)) 4614 + return -1; 4615 + 4616 + return val; 4617 + } 4618 + EXPORT_SYMBOL(sk_tx_queue_get); 4619 + 4594 4620 u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, 4595 4621 struct net_device *sb_dev) 4596 4622 { ··· 4632 4606 if (new_index < 0) 4633 4607 new_index = skb_tx_hash(dev, sb_dev, skb); 4634 4608 4635 - if (queue_index != new_index && sk && 4636 - sk_fullsock(sk) && 4609 + if (sk && sk_fullsock(sk) && 4637 4610 rcu_access_pointer(sk->sk_dst_cache)) 4638 4611 sk_tx_queue_set(sk, new_index); 4639 4612
+1
net/core/net_namespace.c
··· 395 395 net->core.sysctl_optmem_max = 128 * 1024; 396 396 net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; 397 397 net->core.sysctl_tstamp_allow_data = 1; 398 + net->core.sysctl_txq_reselection = msecs_to_jiffies(1000); 398 399 } 399 400 400 401 /* init code that must occur even if setup_net() is not called. */
+13 -4
net/core/sock.c
··· 2313 2313 } 2314 2314 2315 2315 sock_net_set(sk, net); 2316 - refcount_set(&sk->sk_wmem_alloc, 1); 2316 + refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 2317 2317 2318 2318 mem_cgroup_sk_alloc(sk); 2319 2319 cgroup_sk_alloc(&sk->sk_cgrp_data); ··· 2494 2494 2495 2495 atomic_set(&newsk->sk_rmem_alloc, 0); 2496 2496 2497 - /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ 2498 - refcount_set(&newsk->sk_wmem_alloc, 1); 2497 + refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 2499 2498 2500 2499 atomic_set(&newsk->sk_omem_alloc, 0); 2501 2500 sk_init_common(newsk); ··· 2694 2695 2695 2696 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2696 2697 { 2698 + int old_wmem; 2699 + 2697 2700 skb_orphan(skb); 2698 2701 #ifdef CONFIG_INET 2699 2702 if (unlikely(!sk_fullsock(sk))) ··· 2709 2708 * is enough to guarantee sk_free() won't free this sock until 2710 2709 * all in-flight packets are completed 2711 2710 */ 2712 - refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2711 + __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem); 2712 + 2713 + /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket 2714 + * is in a host queue (qdisc, NIC queue). 2715 + * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue 2716 + * based on XPS for better performance. 2717 + * Otherwise clear ooo_okay to not risk Out Of Order delivery. 2718 + */ 2719 + skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); 2713 2720 } 2714 2721 EXPORT_SYMBOL(skb_set_owner_w); 2715 2722
+7
net/core/sysctl_net_core.c
··· 668 668 .proc_handler = proc_dou8vec_minmax, 669 669 }, 670 670 { 671 + .procname = "txq_reselection_ms", 672 + .data = &init_net.core.sysctl_txq_reselection, 673 + .maxlen = sizeof(int), 674 + .mode = 0644, 675 + .proc_handler = proc_dointvec_ms_jiffies, 676 + }, 677 + { 671 678 .procname = "tstamp_allow_data", 672 679 .data = &init_net.core.sysctl_tstamp_allow_data, 673 680 .maxlen = sizeof(u8),