Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'inetpeer-reduce-false-sharing-and-atomic-operations'

Eric Dumazet says:

====================
inetpeer: reduce false sharing and atomic operations

After commit 8c2bd38b95f7 ("icmp: change the order of rate limits"),
there is a risk that a host receiving packets from a unique
source targeting closed ports is using a common inet_peer structure
from many cpus.

All these cpus have to acquire/release a refcount and update
the inet_peer timestamp (p->dtime).

Switch to pure RCU to avoid changing the refcount, and update
p->dtime only once per jiffy.

Tested:
DUT : 128 cores, 32 hw rx queues.
receiving 8,400,000 UDP packets per second, targeting closed ports.

Before the series:
- NAPI poll cannot keep up, NIC drops 1,200,000 packets
per second.
- We use 20 % of cpu cycles.

After this series:
- All packets are received (no more hw drops)
- We use 12 % of cpu cycles.

v1: https://lore.kernel.org/20241213130212.1783302-1-edumazet@google.com
====================

Link: https://patch.msgid.link/20241215175629.1248773-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+47 -57
+5 -7
include/net/inetpeer.h
··· 96 96 97 97 /* can be called with or without local BH being disabled */ 98 98 struct inet_peer *inet_getpeer(struct inet_peer_base *base, 99 - const struct inetpeer_addr *daddr, 100 - int create); 99 + const struct inetpeer_addr *daddr); 101 100 102 101 static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, 103 102 __be32 v4daddr, 104 - int vif, int create) 103 + int vif) 105 104 { 106 105 struct inetpeer_addr daddr; 107 106 108 107 daddr.a4.addr = v4daddr; 109 108 daddr.a4.vif = vif; 110 109 daddr.family = AF_INET; 111 - return inet_getpeer(base, &daddr, create); 110 + return inet_getpeer(base, &daddr); 112 111 } 113 112 114 113 static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, 115 - const struct in6_addr *v6daddr, 116 - int create) 114 + const struct in6_addr *v6daddr) 117 115 { 118 116 struct inetpeer_addr daddr; 119 117 120 118 daddr.a6 = *v6daddr; 121 119 daddr.family = AF_INET6; 122 - return inet_getpeer(base, &daddr, create); 120 + return inet_getpeer(base, &daddr); 123 121 } 124 122 125 123 static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a,
+4 -5
net/ipv4/icmp.c
··· 312 312 struct dst_entry *dst = &rt->dst; 313 313 struct inet_peer *peer; 314 314 bool rc = true; 315 - int vif; 316 315 317 316 if (!apply_ratelimit) 318 317 return true; ··· 320 321 if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) 321 322 goto out; 322 323 323 - vif = l3mdev_master_ifindex(dst->dev); 324 - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); 324 + rcu_read_lock(); 325 + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 326 + l3mdev_master_ifindex_rcu(dst->dev)); 325 327 rc = inet_peer_xrlim_allow(peer, 326 328 READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); 327 - if (peer) 328 - inet_putpeer(peer); 329 + rcu_read_unlock(); 329 330 out: 330 331 if (!rc) 331 332 __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
+8 -23
net/ipv4/inetpeer.c
··· 95 95 { 96 96 struct rb_node **pp, *parent, *next; 97 97 struct inet_peer *p; 98 + u32 now; 98 99 99 100 pp = &base->rb_root.rb_node; 100 101 parent = NULL; ··· 109 108 p = rb_entry(parent, struct inet_peer, rb_node); 110 109 cmp = inetpeer_addr_cmp(daddr, &p->daddr); 111 110 if (cmp == 0) { 112 - if (!refcount_inc_not_zero(&p->refcnt)) 113 - break; 111 + now = jiffies; 112 + if (READ_ONCE(p->dtime) != now) 113 + WRITE_ONCE(p->dtime, now); 114 114 return p; 115 115 } 116 116 if (gc_stack) { ··· 152 150 for (i = 0; i < gc_cnt; i++) { 153 151 p = gc_stack[i]; 154 152 155 - /* The READ_ONCE() pairs with the WRITE_ONCE() 156 - * in inet_putpeer() 157 - */ 158 153 delta = (__u32)jiffies - READ_ONCE(p->dtime); 159 154 160 155 if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) ··· 167 168 } 168 169 } 169 170 171 + /* Must be called under RCU : No refcount change is done here. */ 170 172 struct inet_peer *inet_getpeer(struct inet_peer_base *base, 171 - const struct inetpeer_addr *daddr, 172 - int create) 173 + const struct inetpeer_addr *daddr) 173 174 { 174 175 struct inet_peer *p, *gc_stack[PEER_MAX_GC]; 175 176 struct rb_node **pp, *parent; 176 177 unsigned int gc_cnt, seq; 177 - int invalidated; 178 178 179 179 /* Attempt a lockless lookup first. 180 180 * Because of a concurrent writer, we might not find an existing entry. 181 181 */ 182 - rcu_read_lock(); 183 182 seq = read_seqbegin(&base->lock); 184 183 p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); 185 - invalidated = read_seqretry(&base->lock, seq); 186 - rcu_read_unlock(); 187 184 188 185 if (p) 189 186 return p; 190 - 191 - /* If no writer did a change during our lookup, we can return early. */ 192 - if (!create && !invalidated) 193 - return NULL; 194 187 195 188 /* retry an exact lookup, taking the lock before. 196 189 * At least, nodes should be hot in our cache. 
··· 192 201 193 202 gc_cnt = 0; 194 203 p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); 195 - if (!p && create) { 204 + if (!p) { 196 205 p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); 197 206 if (p) { 198 207 p->daddr = *daddr; 199 208 p->dtime = (__u32)jiffies; 200 - refcount_set(&p->refcnt, 2); 209 + refcount_set(&p->refcnt, 1); 201 210 atomic_set(&p->rid, 0); 202 211 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; 203 212 p->rate_tokens = 0; ··· 222 231 223 232 void inet_putpeer(struct inet_peer *p) 224 233 { 225 - /* The WRITE_ONCE() pairs with itself (we run lockless) 226 - * and the READ_ONCE() in inet_peer_gc() 227 - */ 228 - WRITE_ONCE(p->dtime, (__u32)jiffies); 229 - 230 234 if (refcount_dec_and_test(&p->refcnt)) 231 235 kfree_rcu(p, rcu); 232 236 } 233 - EXPORT_SYMBOL_GPL(inet_putpeer); 234 237 235 238 /* 236 239 * Check transmit rate limitation for given message.
+10 -5
net/ipv4/ip_fragment.c
··· 82 82 static void ip4_frag_init(struct inet_frag_queue *q, const void *a) 83 83 { 84 84 struct ipq *qp = container_of(q, struct ipq, q); 85 - struct net *net = q->fqdir->net; 86 - 87 85 const struct frag_v4_compare_key *key = a; 86 + struct net *net = q->fqdir->net; 87 + struct inet_peer *p = NULL; 88 88 89 89 q->key.v4 = *key; 90 90 qp->ecn = 0; 91 - qp->peer = q->fqdir->max_dist ? 92 - inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : 93 - NULL; 91 + if (q->fqdir->max_dist) { 92 + rcu_read_lock(); 93 + p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif); 94 + if (p && !refcount_inc_not_zero(&p->refcnt)) 95 + p = NULL; 96 + rcu_read_unlock(); 97 + } 98 + qp->peer = p; 94 99 } 95 100 96 101 static void ip4_frag_free(struct inet_frag_queue *q)
+9 -8
net/ipv4/route.c
··· 870 870 } 871 871 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 872 872 vif = l3mdev_master_ifindex_rcu(rt->dst.dev); 873 - rcu_read_unlock(); 874 873 875 874 net = dev_net(rt->dst.dev); 876 - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1); 875 + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif); 877 876 if (!peer) { 877 + rcu_read_unlock(); 878 878 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, 879 879 rt_nexthop(rt, ip_hdr(skb)->daddr)); 880 880 return; ··· 893 893 */ 894 894 if (peer->n_redirects >= ip_rt_redirect_number) { 895 895 peer->rate_last = jiffies; 896 - goto out_put_peer; 896 + goto out_unlock; 897 897 } 898 898 899 899 /* Check for load limit; set rate_last to the latest sent ··· 914 914 &ip_hdr(skb)->saddr, inet_iif(skb), 915 915 &ip_hdr(skb)->daddr, &gw); 916 916 } 917 - out_put_peer: 918 - inet_putpeer(peer); 917 + out_unlock: 918 + rcu_read_unlock(); 919 919 } 920 920 921 921 static int ip_error(struct sk_buff *skb) ··· 975 975 break; 976 976 } 977 977 978 + rcu_read_lock(); 978 979 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 979 - l3mdev_master_ifindex(skb->dev), 1); 980 - 980 + l3mdev_master_ifindex_rcu(skb->dev)); 981 981 send = true; 982 982 if (peer) { 983 983 now = jiffies; ··· 989 989 peer->rate_tokens -= ip_rt_error_cost; 990 990 else 991 991 send = false; 992 - inet_putpeer(peer); 993 992 } 993 + rcu_read_unlock(); 994 + 994 995 if (send) 995 996 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 996 997
+3 -3
net/ipv6/icmp.c
··· 222 222 if (rt->rt6i_dst.plen < 128) 223 223 tmo >>= ((128 - rt->rt6i_dst.plen)>>5); 224 224 225 - peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1); 225 + rcu_read_lock(); 226 + peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); 226 227 res = inet_peer_xrlim_allow(peer, tmo); 227 - if (peer) 228 - inet_putpeer(peer); 228 + rcu_read_unlock(); 229 229 } 230 230 if (!res) 231 231 __ICMP6_INC_STATS(net, ip6_dst_idev(dst),
+3 -3
net/ipv6/ip6_output.c
··· 613 613 else 614 614 target = &hdr->daddr; 615 615 616 - peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); 616 + rcu_read_lock(); 617 + peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr); 617 618 618 619 /* Limit redirects both by destination (here) 619 620 and by source (inside ndisc_send_redirect) 620 621 */ 621 622 if (inet_peer_xrlim_allow(peer, 1*HZ)) 622 623 ndisc_send_redirect(skb, target); 623 - if (peer) 624 - inet_putpeer(peer); 624 + rcu_read_unlock(); 625 625 } else { 626 626 int addrtype = ipv6_addr_type(&hdr->saddr); 627 627
+5 -3
net/ipv6/ndisc.c
··· 1731 1731 "Redirect: destination is not a neighbour\n"); 1732 1732 goto release; 1733 1733 } 1734 - peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); 1734 + 1735 + rcu_read_lock(); 1736 + peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr); 1735 1737 ret = inet_peer_xrlim_allow(peer, 1*HZ); 1736 - if (peer) 1737 - inet_putpeer(peer); 1738 + rcu_read_unlock(); 1739 + 1738 1740 if (!ret) 1739 1741 goto release; 1740 1742