net_sched: add Qdisc_read_mostly and Qdisc_write groups

It is possible to reorganize struct Qdisc so that the fast path dirties
a single cache line instead of always dirtying two.

In the current layout, the fast path changes only four fields (six for some Qdiscs) in the first of these two cache lines:
- q.spinlock
- q.qlen
- bstats.bytes
- bstats.packets
- some Qdisc also change q.next/q.prev

In the second cache line, the fast path changes:
- running
- state
- qstats.backlog

/* --- cacheline 2 boundary (128 bytes) --- */
struct sk_buff_head gso_skb __attribute__((__aligned__(64))); /* 0x80 0x18 */
struct qdisc_skb_head q; /* 0x98 0x18 */
struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xb0 0x10 */

/* --- cacheline 3 boundary (192 bytes) --- */
struct gnet_stats_queue qstats; /* 0xc0 0x14 */
bool running; /* 0xd4 0x1 */

/* XXX 3 bytes hole, try to pack */

unsigned long state; /* 0xd8 0x8 */
struct Qdisc * next_sched; /* 0xe0 0x8 */
struct sk_buff_head skb_bad_txq; /* 0xe8 0x18 */
/* --- cacheline 4 boundary (256 bytes) --- */

Reorganize the fields so that the first cache line is mostly read
and the second one is mostly written.

This gives a ~3% increase in performance under TX stress.

Note that there is an additional hole because @qstats now spills over into a third cache line.

/* --- cacheline 2 boundary (128 bytes) --- */
__u8 __cacheline_group_begin__Qdisc_read_mostly[0] __attribute__((__aligned__(64))); /* 0x80 0 */
struct sk_buff_head gso_skb; /* 0x80 0x18 */
struct Qdisc * next_sched; /* 0x98 0x8 */
struct sk_buff_head skb_bad_txq; /* 0xa0 0x18 */
__u8 __cacheline_group_end__Qdisc_read_mostly[0]; /* 0xb8 0 */

/* XXX 8 bytes hole, try to pack */

/* --- cacheline 3 boundary (192 bytes) --- */
__u8 __cacheline_group_begin__Qdisc_write[0] __attribute__((__aligned__(64))); /* 0xc0 0 */
struct qdisc_skb_head q; /* 0xc0 0x18 */
unsigned long state; /* 0xd8 0x8 */
struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xe0 0x10 */
bool running; /* 0xf0 0x1 */

/* XXX 3 bytes hole, try to pack */

struct gnet_stats_queue qstats; /* 0xf4 0x14 */
/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
__u8 __cacheline_group_end__Qdisc_write[0]; /* 0x108 0 */

/* XXX 56 bytes hole, try to pack */

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20251121083256.674562-8-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Eric Dumazet and committed by

Paolo Abeni 6 months ago ad50d5a3 c5d34f45

+18 -11

1 changed file

expand all

include

net

sch_generic.h

+18 -11

include/net/sch_generic.h

@@ -103,17 +103,24 @@
 	int pad;
 	refcount_t refcnt;
 
-	/*
-	 * For performance sake on SMP, we put highly modified fields at the end
-	 */
-	struct sk_buff_head gso_skb ____cacheline_aligned_in_smp;
-	struct qdisc_skb_head q;
-	struct gnet_stats_basic_sync bstats;
-	struct gnet_stats_queue qstats;
-	bool running; /* must be written under qdisc spinlock */
-	unsigned long state;
-	struct Qdisc *next_sched;
-	struct sk_buff_head skb_bad_txq;
+	/* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */
+	__cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned;
+	struct sk_buff_head gso_skb;
+	struct Qdisc *next_sched;
+	struct sk_buff_head skb_bad_txq;
+	__cacheline_group_end(Qdisc_read_mostly);
+
+	/* Fields dirtied in dequeue() fast path. */
+	__cacheline_group_begin(Qdisc_write) ____cacheline_aligned;
+	struct qdisc_skb_head q;
+	unsigned long state;
+	struct gnet_stats_basic_sync bstats;
+	bool running; /* must be written under qdisc spinlock */
+
+	/* Note : we only change qstats.backlog in fast path. */
+	struct gnet_stats_queue qstats;
+	__cacheline_group_end(Qdisc_write);
+
 
 	atomic_long_t defer_count ____cacheline_aligned_in_smp;
 	struct llist_head defer_list;

Configure Feed

Configure Feed