Merge branch 'udp-increase-rx-performance-under-stress'

+19 -20

include/linux/ipv6.h

··· 214 214 215 215 /* struct ipv6_pinfo - ipv6 private area */ 216 216 struct ipv6_pinfo { 217 + /* Used in tx path (inet6_csk_route_socket(), ip6_xmit()) */ 217 218 struct in6_addr saddr; 218 - struct in6_pktinfo sticky_pktinfo; 219 - const struct in6_addr *daddr_cache; 220 - #ifdef CONFIG_IPV6_SUBTREES 221 - const struct in6_addr *saddr_cache; 222 - #endif 223 - 224 219 __be32 flow_label; 225 - __u32 frag_size; 226 - 220 + u32 dst_cookie; 221 + struct ipv6_txoptions __rcu *opt; 227 222 s16 hop_limit; 223 + u8 pmtudisc; 224 + u8 tclass; 225 + #ifdef CONFIG_IPV6_SUBTREES 226 + bool saddr_cache; 227 + #endif 228 + bool daddr_cache; 229 + 228 230 u8 mcast_hops; 231 + u32 frag_size; 229 232 230 233 int ucast_oif; 231 234 int mcast_oif; ··· 236 233 /* pktoption flags */ 237 234 union { 238 235 struct { 239 - __u16 srcrt:1, 236 + u16 srcrt:1, 240 237 osrcrt:1, 241 238 rxinfo:1, 242 239 rxoinfo:1, ··· 253 250 recvfragsize:1; 254 251 /* 1 bits hole */ 255 252 } bits; 256 - __u16 all; 253 + u16 all; 257 254 } rxopt; 258 255 259 256 /* sockopt flags */ 260 - __u8 srcprefs; /* 001: prefer temporary address 257 + u8 srcprefs; /* 001: prefer temporary address 261 258 * 010: prefer public address 262 259 * 100: prefer care-of address 263 260 */ 264 - __u8 pmtudisc; 265 - __u8 min_hopcount; 266 - __u8 tclass; 261 + u8 min_hopcount; 267 262 __be32 rcv_flowinfo; 263 + struct in6_pktinfo sticky_pktinfo; 268 264 269 - __u32 dst_cookie; 265 + struct sk_buff *pktoptions; 266 + struct sk_buff *rxpmtu; 267 + struct inet6_cork cork; 270 268 271 269 struct ipv6_mc_socklist __rcu *ipv6_mc_list; 272 270 struct ipv6_ac_socklist *ipv6_ac_list; 273 271 struct ipv6_fl_socklist __rcu *ipv6_fl_list; 274 - 275 - struct ipv6_txoptions __rcu *opt; 276 - struct sk_buff *pktoptions; 277 - struct sk_buff *rxpmtu; 278 - struct inet6_cork cork; 279 272 }; 280 273 281 274 /* We currently use available bits from inet_sk(sk)->inet_flags,

+1

include/linux/udp.h

··· 109 109 */ 110 110 struct hlist_node tunnel_list; 111 111 struct numa_drop_counters drop_counters; 112 + spinlock_t busylock ____cacheline_aligned_in_smp; 112 113 }; 113 114 114 115 #define udp_test_bit(nr, sk) \

+4 -4

include/net/ip6_route.h

··· 229 229 * Store a destination cache entry in a socket 230 230 */ 231 231 static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, 232 - const struct in6_addr *daddr, 233 - const struct in6_addr *saddr) 232 + bool daddr_set, 233 + bool saddr_set) 234 234 { 235 235 struct ipv6_pinfo *np = inet6_sk(sk); 236 236 237 237 np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); 238 238 sk_setup_caps(sk, dst); 239 - np->daddr_cache = daddr; 239 + np->daddr_cache = daddr_set; 240 240 #ifdef CONFIG_IPV6_SUBTREES 241 - np->saddr_cache = saddr; 241 + np->saddr_cache = saddr_set; 242 242 #endif 243 243 } 244 244

+2 -2

include/net/sock.h

··· 394 394 395 395 atomic_t sk_drops; 396 396 __s32 sk_peek_off; 397 - struct sk_buff_head sk_error_queue; 398 397 struct sk_buff_head sk_receive_queue; 399 398 /* 400 399 * The backlog queue is special, it is always used with ··· 411 412 } sk_backlog; 412 413 #define sk_rmem_alloc sk_backlog.rmem_alloc 413 414 415 + struct sk_buff_head sk_error_queue; 414 416 __cacheline_group_end(sock_write_rx); 415 417 416 418 __cacheline_group_begin(sock_read_rx); ··· 451 451 #ifdef CONFIG_XFRM 452 452 struct xfrm_policy __rcu *sk_policy[2]; 453 453 #endif 454 - struct numa_drop_counters *sk_drop_counters; 455 454 __cacheline_group_end(sock_read_rxtx); 456 455 457 456 __cacheline_group_begin(sock_write_rxtx); ··· 567 568 #ifdef CONFIG_BPF_SYSCALL 568 569 struct bpf_local_storage __rcu *sk_bpf_storage; 569 570 #endif 571 + struct numa_drop_counters *sk_drop_counters; 570 572 struct rcu_head sk_rcu; 571 573 netns_tracker ns_tracker; 572 574 struct xarray sk_user_frags;

+6

include/net/udp.h

··· 289 289 struct udp_sock *up = udp_sk(sk); 290 290 291 291 sk->sk_drop_counters = &up->drop_counters; 292 + spin_lock_init(&up->busylock); 292 293 skb_queue_head_init(&up->reader_queue); 293 294 INIT_HLIST_NODE(&up->tunnel_list); 294 295 up->forward_threshold = sk->sk_rcvbuf >> 2; 295 296 set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); 297 + } 298 + 299 + static inline void udp_drops_inc(struct sock *sk) 300 + { 301 + numa_drop_add(&udp_sk(sk)->drop_counters, 1); 296 302 } 297 303 298 304 /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */

-1

net/core/sock.c

··· 4444 4444 #ifdef CONFIG_MEMCG 4445 4445 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); 4446 4446 #endif 4447 - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters); 4448 4447 4449 4448 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); 4450 4449 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);

+22 -28

net/ipv4/udp.c

··· 1689 1689 * to relieve pressure on the receive_queue spinlock shared by consumer. 1690 1690 * Under flood, this means that only one producer can be in line 1691 1691 * trying to acquire the receive_queue spinlock. 1692 - * These busylock can be allocated on a per cpu manner, instead of a 1693 - * per socket one (that would consume a cache line per socket) 1694 1692 */ 1695 - static int udp_busylocks_log __read_mostly; 1696 - static spinlock_t *udp_busylocks __read_mostly; 1697 - 1698 - static spinlock_t *busylock_acquire(void *ptr) 1693 + static spinlock_t *busylock_acquire(struct sock *sk) 1699 1694 { 1700 - spinlock_t *busy; 1695 + spinlock_t *busy = &udp_sk(sk)->busylock; 1701 1696 1702 - busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); 1703 1697 spin_lock(busy); 1704 1698 return busy; 1705 1699 } ··· 1733 1739 if (rcvbuf > INT_MAX >> 1) 1734 1740 goto drop; 1735 1741 1736 - /* Always allow at least one packet for small buffer. */ 1737 - if (rmem > rcvbuf) 1742 + /* Accept the packet if queue is empty. */ 1743 + if (rmem) 1738 1744 goto drop; 1739 1745 } 1740 1746 ··· 1747 1753 if (rmem > (rcvbuf >> 1)) { 1748 1754 skb_condense(skb); 1749 1755 size = skb->truesize; 1756 + rmem = atomic_add_return(size, &sk->sk_rmem_alloc); 1757 + if (rmem > rcvbuf) 1758 + goto uncharge_drop; 1750 1759 busy = busylock_acquire(sk); 1760 + } else { 1761 + atomic_add(size, &sk->sk_rmem_alloc); 1751 1762 } 1752 1763 1753 1764 udp_set_dev_scratch(skb); 1754 - 1755 - atomic_add(size, &sk->sk_rmem_alloc); 1756 1765 1757 1766 spin_lock(&list->lock); 1758 1767 err = udp_rmem_schedule(sk, size); ··· 1784 1787 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 1785 1788 1786 1789 drop: 1787 - sk_drops_inc(sk); 1790 + udp_drops_inc(sk); 1788 1791 busylock_release(busy); 1789 1792 return err; 1790 1793 } ··· 1825 1828 if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset))) 1826 1829 sk_peek_offset_bwd(sk, len); 1827 1830 1831 + if (!skb_shared(skb)) { 1832 + if (unlikely(udp_skb_has_head_state(skb))) 1833 + skb_release_head_state(skb); 1834 + skb_attempt_defer_free(skb); 1835 + return; 1836 + } 1837 + 1828 1838 if (!skb_unref(skb)) 1829 1839 return; 1830 1840 ··· 1856 1852 IS_UDPLITE(sk)); 1857 1853 __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, 1858 1854 IS_UDPLITE(sk)); 1859 - sk_drops_inc(sk); 1855 + udp_drops_inc(sk); 1860 1856 __skb_unlink(skb, rcvq); 1861 1857 *total += skb->truesize; 1862 1858 kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); ··· 2012 2008 2013 2009 __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); 2014 2010 __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); 2015 - sk_drops_inc(sk); 2011 + udp_drops_inc(sk); 2016 2012 kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); 2017 2013 goto try_again; 2018 2014 } ··· 2082 2078 2083 2079 if (unlikely(err)) { 2084 2080 if (!peeking) { 2085 - sk_drops_inc(sk); 2081 + udp_drops_inc(sk); 2086 2082 UDP_INC_STATS(sock_net(sk), 2087 2083 UDP_MIB_INERRORS, is_udplite); 2088 2084 } ··· 2453 2449 __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); 2454 2450 drop: 2455 2451 __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 2456 - sk_drops_inc(sk); 2452 + udp_drops_inc(sk); 2457 2453 sk_skb_reason_drop(sk, skb, drop_reason); 2458 2454 return -1; 2459 2455 } ··· 2538 2534 nskb = skb_clone(skb, GFP_ATOMIC); 2539 2535 2540 2536 if (unlikely(!nskb)) { 2541 - sk_drops_inc(sk); 2537 + udp_drops_inc(sk); 2542 2538 __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, 2543 2539 IS_UDPLITE(sk)); 2544 2540 __UDP_INC_STATS(net, UDP_MIB_INERRORS, ··· 3998 3994 void __init udp_init(void) 3999 3995 { 4000 3996 unsigned long limit; 4001 - unsigned int i; 4002 3997 4003 3998 udp_table_init(&udp_table, "UDP"); 4004 3999 limit = nr_free_buffer_pages() / 8; ··· 4005 4002 sysctl_udp_mem[0] = limit / 4 * 3; 4006 4003 sysctl_udp_mem[1] = limit; 4007 4004 sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; 4008 - 4009 - /* 16 spinlocks per cpu */ 4010 - udp_busylocks_log = ilog2(nr_cpu_ids) + 4; 4011 - udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, 4012 - GFP_KERNEL); 4013 - if (!udp_busylocks) 4014 - panic("UDP: failed to alloc udp_busylocks\n"); 4015 - for (i = 0; i < (1U << udp_busylocks_log); i++) 4016 - spin_lock_init(udp_busylocks + i); 4017 4005 4018 4006 if (register_pernet_subsys(&udp_sysctl_ops)) 4019 4007 panic("UDP: failed to init sysctl parameters.\n");

+1 -1

net/ipv6/af_inet6.c

··· 857 857 return PTR_ERR(dst); 858 858 } 859 859 860 - ip6_dst_store(sk, dst, NULL, NULL); 860 + ip6_dst_store(sk, dst, false, false); 861 861 } 862 862 863 863 return 0;

+1 -1

net/ipv6/inet6_connection_sock.c

··· 91 91 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); 92 92 93 93 if (!IS_ERR(dst)) 94 - ip6_dst_store(sk, dst, NULL, NULL); 94 + ip6_dst_store(sk, dst, false, false); 95 95 } 96 96 return dst; 97 97 }

+4 -2

net/ipv6/ip6_output.c

··· 1100 1100 * sockets. 1101 1101 * 2. oif also should be the same. 1102 1102 */ 1103 - if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || 1103 + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, 1104 + np->daddr_cache ? &sk->sk_v6_daddr : NULL) || 1104 1105 #ifdef CONFIG_IPV6_SUBTREES 1105 - ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || 1106 + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, 1107 + np->saddr_cache ? &np->saddr : NULL) || 1106 1108 #endif 1107 1109 (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { 1108 1110 dst_release(dst);

+1 -1

net/ipv6/raw.c

··· 445 445 if (flags & MSG_ERRQUEUE) 446 446 return ipv6_recv_error(sk, msg, len, addr_len); 447 447 448 - if (np->rxpmtu && np->rxopt.bits.rxpmtu) 448 + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) 449 449 return ipv6_recv_rxpmtu(sk, msg, len, addr_len); 450 450 451 451 skb = skb_recv_datagram(sk, flags, &err);

+3 -4

net/ipv6/route.c

··· 3032 3032 #endif 3033 3033 3034 3034 ip6_dst_store(sk, dst, 3035 - ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? 3036 - &sk->sk_v6_daddr : NULL, 3035 + ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), 3037 3036 #ifdef CONFIG_IPV6_SUBTREES 3038 3037 ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 3039 - &np->saddr : 3038 + true : 3040 3039 #endif 3041 - NULL); 3040 + false); 3042 3041 } 3043 3042 3044 3043 static bool ip6_redirect_nh_match(const struct fib6_result *res,

+2 -2

net/ipv6/tcp_ipv6.c

··· 299 299 inet->inet_rcv_saddr = LOOPBACK4_IPV6; 300 300 301 301 sk->sk_gso_type = SKB_GSO_TCPV6; 302 - ip6_dst_store(sk, dst, NULL, NULL); 302 + ip6_dst_store(sk, dst, false, false); 303 303 304 304 icsk->icsk_ext_hdr_len = 0; 305 305 if (opt) ··· 1459 1459 1460 1460 memcpy(newnp, np, sizeof(struct ipv6_pinfo)); 1461 1461 1462 - ip6_dst_store(newsk, dst, NULL, NULL); 1462 + ip6_dst_store(newsk, dst, false, false); 1463 1463 1464 1464 newnp->saddr = ireq->ir_v6_loc_addr; 1465 1465

+4 -4

net/ipv6/udp.c

··· 479 479 if (flags & MSG_ERRQUEUE) 480 480 return ipv6_recv_error(sk, msg, len, addr_len); 481 481 482 - if (np->rxpmtu && np->rxopt.bits.rxpmtu) 482 + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) 483 483 return ipv6_recv_rxpmtu(sk, msg, len, addr_len); 484 484 485 485 try_again: ··· 524 524 } 525 525 if (unlikely(err)) { 526 526 if (!peeking) { 527 - sk_drops_inc(sk); 527 + udp_drops_inc(sk); 528 528 SNMP_INC_STATS(mib, UDP_MIB_INERRORS); 529 529 } 530 530 kfree_skb(skb); ··· 908 908 __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); 909 909 drop: 910 910 __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); 911 - sk_drops_inc(sk); 911 + udp_drops_inc(sk); 912 912 sk_skb_reason_drop(sk, skb, drop_reason); 913 913 return -1; 914 914 } ··· 1013 1013 } 1014 1014 nskb = skb_clone(skb, GFP_ATOMIC); 1015 1015 if (unlikely(!nskb)) { 1016 - sk_drops_inc(sk); 1016 + udp_drops_inc(sk); 1017 1017 __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, 1018 1018 IS_UDPLITE(sk)); 1019 1019 __UDP6_INC_STATS(net, UDP_MIB_INERRORS,

Configure Feed

Configure Feed