Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'udp-increase-rx-performance-under-stress'

Eric Dumazet says:

====================
udp: increase RX performance under stress

This series is the result of careful analysis of the UDP stack,
to optimize the receive side, especially when one or several
UDP sockets are under a DDOS attack.

I have measured a 47 % increase of throughput when using
IPv6 UDP packets with 120 bytes of payload, under DDOS.

16 cpus are receiving traffic targeting a single socket.

Even after adding NUMA aware drop counters, we were suffering
from false sharing between packet producers and the consumer.

1) First four patches shrink struct ipv6_pinfo size
and reorganize fields to get a more efficient TX path.
They should also benefit TCP, by removing one cache line miss.

2) Patches 5 & 6 change how sk->sk_rmem_alloc is read and updated.
They reduce spinlock contention on the busylock.

3) Patches 7 & 8 change the ordering of sk_backlog (including
sk_rmem_alloc) sk_receive_queue and sk_drop_counters for
better data locality.

4) Patch 9 removes the hashed array of spinlocks in favor of
a per-udp-socket one.

5) Final patch adopts skb_attempt_defer_free(), after TCP got
good results with it.
====================

Link: https://patch.msgid.link/20250916160951.541279-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+70 -70
+19 -20
include/linux/ipv6.h
··· 214 214 215 215 /* struct ipv6_pinfo - ipv6 private area */ 216 216 struct ipv6_pinfo { 217 + /* Used in tx path (inet6_csk_route_socket(), ip6_xmit()) */ 217 218 struct in6_addr saddr; 218 - struct in6_pktinfo sticky_pktinfo; 219 - const struct in6_addr *daddr_cache; 220 - #ifdef CONFIG_IPV6_SUBTREES 221 - const struct in6_addr *saddr_cache; 222 - #endif 223 - 224 219 __be32 flow_label; 225 - __u32 frag_size; 226 - 220 + u32 dst_cookie; 221 + struct ipv6_txoptions __rcu *opt; 227 222 s16 hop_limit; 223 + u8 pmtudisc; 224 + u8 tclass; 225 + #ifdef CONFIG_IPV6_SUBTREES 226 + bool saddr_cache; 227 + #endif 228 + bool daddr_cache; 229 + 228 230 u8 mcast_hops; 231 + u32 frag_size; 229 232 230 233 int ucast_oif; 231 234 int mcast_oif; ··· 236 233 /* pktoption flags */ 237 234 union { 238 235 struct { 239 - __u16 srcrt:1, 236 + u16 srcrt:1, 240 237 osrcrt:1, 241 238 rxinfo:1, 242 239 rxoinfo:1, ··· 253 250 recvfragsize:1; 254 251 /* 1 bits hole */ 255 252 } bits; 256 - __u16 all; 253 + u16 all; 257 254 } rxopt; 258 255 259 256 /* sockopt flags */ 260 - __u8 srcprefs; /* 001: prefer temporary address 257 + u8 srcprefs; /* 001: prefer temporary address 261 258 * 010: prefer public address 262 259 * 100: prefer care-of address 263 260 */ 264 - __u8 pmtudisc; 265 - __u8 min_hopcount; 266 - __u8 tclass; 261 + u8 min_hopcount; 267 262 __be32 rcv_flowinfo; 263 + struct in6_pktinfo sticky_pktinfo; 268 264 269 - __u32 dst_cookie; 265 + struct sk_buff *pktoptions; 266 + struct sk_buff *rxpmtu; 267 + struct inet6_cork cork; 270 268 271 269 struct ipv6_mc_socklist __rcu *ipv6_mc_list; 272 270 struct ipv6_ac_socklist *ipv6_ac_list; 273 271 struct ipv6_fl_socklist __rcu *ipv6_fl_list; 274 - 275 - struct ipv6_txoptions __rcu *opt; 276 - struct sk_buff *pktoptions; 277 - struct sk_buff *rxpmtu; 278 - struct inet6_cork cork; 279 272 }; 280 273 281 274 /* We currently use available bits from inet_sk(sk)->inet_flags,
+1
include/linux/udp.h
··· 109 109 */ 110 110 struct hlist_node tunnel_list; 111 111 struct numa_drop_counters drop_counters; 112 + spinlock_t busylock ____cacheline_aligned_in_smp; 112 113 }; 113 114 114 115 #define udp_test_bit(nr, sk) \
+4 -4
include/net/ip6_route.h
··· 229 229 * Store a destination cache entry in a socket 230 230 */ 231 231 static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, 232 - const struct in6_addr *daddr, 233 - const struct in6_addr *saddr) 232 + bool daddr_set, 233 + bool saddr_set) 234 234 { 235 235 struct ipv6_pinfo *np = inet6_sk(sk); 236 236 237 237 np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); 238 238 sk_setup_caps(sk, dst); 239 - np->daddr_cache = daddr; 239 + np->daddr_cache = daddr_set; 240 240 #ifdef CONFIG_IPV6_SUBTREES 241 - np->saddr_cache = saddr; 241 + np->saddr_cache = saddr_set; 242 242 #endif 243 243 } 244 244
+2 -2
include/net/sock.h
··· 394 394 395 395 atomic_t sk_drops; 396 396 __s32 sk_peek_off; 397 - struct sk_buff_head sk_error_queue; 398 397 struct sk_buff_head sk_receive_queue; 399 398 /* 400 399 * The backlog queue is special, it is always used with ··· 411 412 } sk_backlog; 412 413 #define sk_rmem_alloc sk_backlog.rmem_alloc 413 414 415 + struct sk_buff_head sk_error_queue; 414 416 __cacheline_group_end(sock_write_rx); 415 417 416 418 __cacheline_group_begin(sock_read_rx); ··· 451 451 #ifdef CONFIG_XFRM 452 452 struct xfrm_policy __rcu *sk_policy[2]; 453 453 #endif 454 - struct numa_drop_counters *sk_drop_counters; 455 454 __cacheline_group_end(sock_read_rxtx); 456 455 457 456 __cacheline_group_begin(sock_write_rxtx); ··· 567 568 #ifdef CONFIG_BPF_SYSCALL 568 569 struct bpf_local_storage __rcu *sk_bpf_storage; 569 570 #endif 571 + struct numa_drop_counters *sk_drop_counters; 570 572 struct rcu_head sk_rcu; 571 573 netns_tracker ns_tracker; 572 574 struct xarray sk_user_frags;
+6
include/net/udp.h
··· 289 289 struct udp_sock *up = udp_sk(sk); 290 290 291 291 sk->sk_drop_counters = &up->drop_counters; 292 + spin_lock_init(&up->busylock); 292 293 skb_queue_head_init(&up->reader_queue); 293 294 INIT_HLIST_NODE(&up->tunnel_list); 294 295 up->forward_threshold = sk->sk_rcvbuf >> 2; 295 296 set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); 297 + } 298 + 299 + static inline void udp_drops_inc(struct sock *sk) 300 + { 301 + numa_drop_add(&udp_sk(sk)->drop_counters, 1); 296 302 } 297 303 298 304 /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */
-1
net/core/sock.c
··· 4444 4444 #ifdef CONFIG_MEMCG 4445 4445 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); 4446 4446 #endif 4447 - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters); 4448 4447 4449 4448 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); 4450 4449 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
+22 -28
net/ipv4/udp.c
··· 1689 1689 * to relieve pressure on the receive_queue spinlock shared by consumer. 1690 1690 * Under flood, this means that only one producer can be in line 1691 1691 * trying to acquire the receive_queue spinlock. 1692 - * These busylock can be allocated on a per cpu manner, instead of a 1693 - * per socket one (that would consume a cache line per socket) 1694 1692 */ 1695 - static int udp_busylocks_log __read_mostly; 1696 - static spinlock_t *udp_busylocks __read_mostly; 1697 - 1698 - static spinlock_t *busylock_acquire(void *ptr) 1693 + static spinlock_t *busylock_acquire(struct sock *sk) 1699 1694 { 1700 - spinlock_t *busy; 1695 + spinlock_t *busy = &udp_sk(sk)->busylock; 1701 1696 1702 - busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); 1703 1697 spin_lock(busy); 1704 1698 return busy; 1705 1699 } ··· 1733 1739 if (rcvbuf > INT_MAX >> 1) 1734 1740 goto drop; 1735 1741 1736 - /* Always allow at least one packet for small buffer. */ 1737 - if (rmem > rcvbuf) 1742 + /* Accept the packet if queue is empty. 
*/ 1743 + if (rmem) 1738 1744 goto drop; 1739 1745 } 1740 1746 ··· 1747 1753 if (rmem > (rcvbuf >> 1)) { 1748 1754 skb_condense(skb); 1749 1755 size = skb->truesize; 1756 + rmem = atomic_add_return(size, &sk->sk_rmem_alloc); 1757 + if (rmem > rcvbuf) 1758 + goto uncharge_drop; 1750 1759 busy = busylock_acquire(sk); 1760 + } else { 1761 + atomic_add(size, &sk->sk_rmem_alloc); 1751 1762 } 1752 1763 1753 1764 udp_set_dev_scratch(skb); 1754 - 1755 - atomic_add(size, &sk->sk_rmem_alloc); 1756 1765 1757 1766 spin_lock(&list->lock); 1758 1767 err = udp_rmem_schedule(sk, size); ··· 1784 1787 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 1785 1788 1786 1789 drop: 1787 - sk_drops_inc(sk); 1790 + udp_drops_inc(sk); 1788 1791 busylock_release(busy); 1789 1792 return err; 1790 1793 } ··· 1825 1828 if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset))) 1826 1829 sk_peek_offset_bwd(sk, len); 1827 1830 1831 + if (!skb_shared(skb)) { 1832 + if (unlikely(udp_skb_has_head_state(skb))) 1833 + skb_release_head_state(skb); 1834 + skb_attempt_defer_free(skb); 1835 + return; 1836 + } 1837 + 1828 1838 if (!skb_unref(skb)) 1829 1839 return; 1830 1840 ··· 1856 1852 IS_UDPLITE(sk)); 1857 1853 __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, 1858 1854 IS_UDPLITE(sk)); 1859 - sk_drops_inc(sk); 1855 + udp_drops_inc(sk); 1860 1856 __skb_unlink(skb, rcvq); 1861 1857 *total += skb->truesize; 1862 1858 kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); ··· 2012 2008 2013 2009 __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); 2014 2010 __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); 2015 - sk_drops_inc(sk); 2011 + udp_drops_inc(sk); 2016 2012 kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); 2017 2013 goto try_again; 2018 2014 } ··· 2082 2078 2083 2079 if (unlikely(err)) { 2084 2080 if (!peeking) { 2085 - sk_drops_inc(sk); 2081 + udp_drops_inc(sk); 2086 2082 UDP_INC_STATS(sock_net(sk), 2087 2083 UDP_MIB_INERRORS, is_udplite); 2088 2084 } ··· 2453 2449 __UDP_INC_STATS(sock_net(sk), 
UDP_MIB_CSUMERRORS, is_udplite); 2454 2450 drop: 2455 2451 __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 2456 - sk_drops_inc(sk); 2452 + udp_drops_inc(sk); 2457 2453 sk_skb_reason_drop(sk, skb, drop_reason); 2458 2454 return -1; 2459 2455 } ··· 2538 2534 nskb = skb_clone(skb, GFP_ATOMIC); 2539 2535 2540 2536 if (unlikely(!nskb)) { 2541 - sk_drops_inc(sk); 2537 + udp_drops_inc(sk); 2542 2538 __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, 2543 2539 IS_UDPLITE(sk)); 2544 2540 __UDP_INC_STATS(net, UDP_MIB_INERRORS, ··· 3998 3994 void __init udp_init(void) 3999 3995 { 4000 3996 unsigned long limit; 4001 - unsigned int i; 4002 3997 4003 3998 udp_table_init(&udp_table, "UDP"); 4004 3999 limit = nr_free_buffer_pages() / 8; ··· 4005 4002 sysctl_udp_mem[0] = limit / 4 * 3; 4006 4003 sysctl_udp_mem[1] = limit; 4007 4004 sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; 4008 - 4009 - /* 16 spinlocks per cpu */ 4010 - udp_busylocks_log = ilog2(nr_cpu_ids) + 4; 4011 - udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, 4012 - GFP_KERNEL); 4013 - if (!udp_busylocks) 4014 - panic("UDP: failed to alloc udp_busylocks\n"); 4015 - for (i = 0; i < (1U << udp_busylocks_log); i++) 4016 - spin_lock_init(udp_busylocks + i); 4017 4005 4018 4006 if (register_pernet_subsys(&udp_sysctl_ops)) 4019 4007 panic("UDP: failed to init sysctl parameters.\n");
+1 -1
net/ipv6/af_inet6.c
··· 857 857 return PTR_ERR(dst); 858 858 } 859 859 860 - ip6_dst_store(sk, dst, NULL, NULL); 860 + ip6_dst_store(sk, dst, false, false); 861 861 } 862 862 863 863 return 0;
+1 -1
net/ipv6/inet6_connection_sock.c
··· 91 91 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); 92 92 93 93 if (!IS_ERR(dst)) 94 - ip6_dst_store(sk, dst, NULL, NULL); 94 + ip6_dst_store(sk, dst, false, false); 95 95 } 96 96 return dst; 97 97 }
+4 -2
net/ipv6/ip6_output.c
··· 1100 1100 * sockets. 1101 1101 * 2. oif also should be the same. 1102 1102 */ 1103 - if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || 1103 + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, 1104 + np->daddr_cache ? &sk->sk_v6_daddr : NULL) || 1104 1105 #ifdef CONFIG_IPV6_SUBTREES 1105 - ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || 1106 + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, 1107 + np->saddr_cache ? &np->saddr : NULL) || 1106 1108 #endif 1107 1109 (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { 1108 1110 dst_release(dst);
+1 -1
net/ipv6/raw.c
··· 445 445 if (flags & MSG_ERRQUEUE) 446 446 return ipv6_recv_error(sk, msg, len, addr_len); 447 447 448 - if (np->rxpmtu && np->rxopt.bits.rxpmtu) 448 + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) 449 449 return ipv6_recv_rxpmtu(sk, msg, len, addr_len); 450 450 451 451 skb = skb_recv_datagram(sk, flags, &err);
+3 -4
net/ipv6/route.c
··· 3032 3032 #endif 3033 3033 3034 3034 ip6_dst_store(sk, dst, 3035 - ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 3036 - &sk->sk_v6_daddr : NULL, 3035 + ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), 3037 3036 #ifdef CONFIG_IPV6_SUBTREES 3038 3037 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 3039 - &np->saddr : 3038 + true : 3040 3039 #endif 3041 - NULL); 3040 + false); 3042 3041 } 3043 3042 3044 3043 static bool ip6_redirect_nh_match(const struct fib6_result *res,
+2 -2
net/ipv6/tcp_ipv6.c
··· 299 299 inet->inet_rcv_saddr = LOOPBACK4_IPV6; 300 300 301 301 sk->sk_gso_type = SKB_GSO_TCPV6; 302 - ip6_dst_store(sk, dst, NULL, NULL); 302 + ip6_dst_store(sk, dst, false, false); 303 303 304 304 icsk->icsk_ext_hdr_len = 0; 305 305 if (opt) ··· 1459 1459 1460 1460 memcpy(newnp, np, sizeof(struct ipv6_pinfo)); 1461 1461 1462 - ip6_dst_store(newsk, dst, NULL, NULL); 1462 + ip6_dst_store(newsk, dst, false, false); 1463 1463 1464 1464 newnp->saddr = ireq->ir_v6_loc_addr; 1465 1465
+4 -4
net/ipv6/udp.c
··· 479 479 if (flags & MSG_ERRQUEUE) 480 480 return ipv6_recv_error(sk, msg, len, addr_len); 481 481 482 - if (np->rxpmtu && np->rxopt.bits.rxpmtu) 482 + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) 483 483 return ipv6_recv_rxpmtu(sk, msg, len, addr_len); 484 484 485 485 try_again: ··· 524 524 } 525 525 if (unlikely(err)) { 526 526 if (!peeking) { 527 - sk_drops_inc(sk); 527 + udp_drops_inc(sk); 528 528 SNMP_INC_STATS(mib, UDP_MIB_INERRORS); 529 529 } 530 530 kfree_skb(skb); ··· 908 908 __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); 909 909 drop: 910 910 __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 911 - sk_drops_inc(sk); 911 + udp_drops_inc(sk); 912 912 sk_skb_reason_drop(sk, skb, drop_reason); 913 913 return -1; 914 914 } ··· 1013 1013 } 1014 1014 nskb = skb_clone(skb, GFP_ATOMIC); 1015 1015 if (unlikely(!nskb)) { 1016 - sk_drops_inc(sk); 1016 + udp_drops_inc(sk); 1017 1017 __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, 1018 1018 IS_UDPLITE(sk)); 1019 1019 __UDP6_INC_STATS(net, UDP_MIB_INERRORS,