Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tcp: Update bind bucket state on port release

Today, once an inet_bind_bucket enters a state where fastreuse >= 0 or
fastreuseport >= 0 after a socket is explicitly bound to a port, it remains
in that state until all sockets are removed and the bucket is destroyed.

In this state, the bucket is skipped during ephemeral port selection in
connect(). For applications using a reduced ephemeral port
range (IP_LOCAL_PORT_RANGE socket option), this can cause faster port
exhaustion since blocked buckets are excluded from reuse.

The reason the bucket state isn't updated on port release is unclear. It is
possibly a performance trade-off to avoid scanning bucket owners, or just
an oversight.

Fix it by recalculating the bucket state when a socket releases a port. To
limit overhead, each inet_bind2_bucket stores its own (fastreuse,
fastreuseport) state. On port release, only the relevant port-addr bucket
(inet_bind2_bucket) is scanned, and the overall port bucket
(inet_bind_bucket) state is derived from the per port-addr bucket states.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250917-update-bind-bucket-state-on-unhash-v5-1-57168b661b47@cloudflare.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Jakub Sitnicki and committed by
Paolo Abeni
d57f4b87 3afb106f

+63 -8
+3 -2
include/net/inet_connection_sock.h
··· 316 316 void inet_csk_listen_stop(struct sock *sk); 317 317 318 318 /* update the fast reuse flag when adding a socket */ 319 - void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, 320 - struct sock *sk); 319 + void inet_csk_update_fastreuse(const struct sock *sk, 320 + struct inet_bind_bucket *tb, 321 + struct inet_bind2_bucket *tb2); 321 322 322 323 struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); 323 324
+2
include/net/inet_hashtables.h
··· 108 108 struct hlist_node bhash_node; 109 109 /* List of sockets hashed to this bucket */ 110 110 struct hlist_head owners; 111 + signed char fastreuse; 112 + signed char fastreuseport; 111 113 }; 112 114 113 115 static inline struct net *ib_net(const struct inet_bind_bucket *ib)
+2 -1
include/net/inet_timewait_sock.h
··· 70 70 unsigned int tw_transparent : 1, 71 71 tw_flowlabel : 20, 72 72 tw_usec_ts : 1, 73 - tw_pad : 2, /* 2 bits hole */ 73 + tw_connect_bind : 1, 74 + tw_pad : 1, /* 1 bit hole */ 74 75 tw_tos : 8; 75 76 u32 tw_txhash; 76 77 u32 tw_priority;
+4
include/net/sock.h
··· 1494 1494 1495 1495 #define SOCK_BINDADDR_LOCK 4 1496 1496 #define SOCK_BINDPORT_LOCK 8 1497 + /** 1498 + * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time 1499 + */ 1500 + #define SOCK_CONNECT_BIND 16 1497 1501 1498 1502 struct socket_alloc { 1499 1503 struct socket socket;
+8 -4
net/ipv4/inet_connection_sock.c
··· 423 423 } 424 424 425 425 static inline int sk_reuseport_match(struct inet_bind_bucket *tb, 426 - struct sock *sk) 426 + const struct sock *sk) 427 427 { 428 428 if (tb->fastreuseport <= 0) 429 429 return 0; ··· 453 453 ipv6_only_sock(sk), true, false); 454 454 } 455 455 456 - void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, 457 - struct sock *sk) 456 + void inet_csk_update_fastreuse(const struct sock *sk, 457 + struct inet_bind_bucket *tb, 458 + struct inet_bind2_bucket *tb2) 458 459 { 459 460 bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; 460 461 ··· 502 501 tb->fastreuseport = 0; 503 502 } 504 503 } 504 + 505 + tb2->fastreuse = tb->fastreuse; 506 + tb2->fastreuseport = tb->fastreuseport; 505 507 } 506 508 507 509 /* Obtain a reference to a local port for the given sock, ··· 586 582 } 587 583 588 584 success: 589 - inet_csk_update_fastreuse(tb, sk); 585 + inet_csk_update_fastreuse(sk, tb, tb2); 590 586 591 587 if (!inet_csk(sk)->icsk_bind_hash) 592 588 inet_bind_hash(sk, tb, tb2, port);
+43 -1
net/ipv4/inet_hashtables.c
··· 58 58 sk->sk_daddr, sk->sk_dport); 59 59 } 60 60 61 + static bool sk_is_connect_bind(const struct sock *sk) 62 + { 63 + if (sk->sk_state == TCP_TIME_WAIT) 64 + return inet_twsk(sk)->tw_connect_bind; 65 + else 66 + return sk->sk_userlocks & SOCK_CONNECT_BIND; 67 + } 68 + 61 69 /* 62 70 * Allocate and initialize a new local port bind bucket. 63 71 * The bindhash mutex for snum's hash chain must be held here. ··· 95 87 */ 96 88 void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) 97 89 { 90 + const struct inet_bind2_bucket *tb2; 91 + 98 92 if (hlist_empty(&tb->bhash2)) { 99 93 hlist_del_rcu(&tb->node); 100 94 kfree_rcu(tb, rcu); 95 + return; 101 96 } 97 + 98 + if (tb->fastreuse == -1 && tb->fastreuseport == -1) 99 + return; 100 + hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { 101 + if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) 102 + return; 103 + } 104 + tb->fastreuse = -1; 105 + tb->fastreuseport = -1; 102 106 } 103 107 104 108 bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, ··· 141 121 #else 142 122 tb2->rcv_saddr = sk->sk_rcv_saddr; 143 123 #endif 124 + tb2->fastreuse = 0; 125 + tb2->fastreuseport = 0; 144 126 INIT_HLIST_HEAD(&tb2->owners); 145 127 hlist_add_head(&tb2->node, &head->chain); 146 128 hlist_add_head(&tb2->bhash_node, &tb->bhash2); ··· 165 143 /* Caller must hold hashbucket lock for this tb with local BH disabled */ 166 144 void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) 167 145 { 146 + const struct sock *sk; 147 + 168 148 if (hlist_empty(&tb->owners)) { 169 149 __hlist_del(&tb->node); 170 150 __hlist_del(&tb->bhash_node); 171 151 kmem_cache_free(cachep, tb); 152 + return; 172 153 } 154 + 155 + if (tb->fastreuse == -1 && tb->fastreuseport == -1) 156 + return; 157 + sk_for_each_bound(sk, &tb->owners) { 158 + if (!sk_is_connect_bind(sk)) 159 + return; 160 + } 161 + tb->fastreuse = -1; 162 + tb->fastreuseport = -1; 173 163 } 174 164 175 165 static bool 
inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, ··· 225 191 tb = inet_csk(sk)->icsk_bind_hash; 226 192 inet_csk(sk)->icsk_bind_hash = NULL; 227 193 inet_sk(sk)->inet_num = 0; 194 + sk->sk_userlocks &= ~SOCK_CONNECT_BIND; 228 195 229 196 spin_lock(&head2->lock); 230 197 if (inet_csk(sk)->icsk_bind2_hash) { ··· 312 277 } 313 278 } 314 279 if (update_fastreuse) 315 - inet_csk_update_fastreuse(tb, child); 280 + inet_csk_update_fastreuse(child, tb, tb2); 316 281 inet_bind_hash(child, tb, tb2, port); 317 282 spin_unlock(&head2->lock); 318 283 spin_unlock(&head->lock); ··· 985 950 if (!tb2) { 986 951 tb2 = new_tb2; 987 952 inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); 953 + if (sk_is_connect_bind(sk)) { 954 + tb2->fastreuse = -1; 955 + tb2->fastreuseport = -1; 956 + } 988 957 } 989 958 inet_csk(sk)->icsk_bind2_hash = tb2; 990 959 sk_add_bind_node(sk, &tb2->owners); ··· 1159 1120 head2, tb, sk); 1160 1121 if (!tb2) 1161 1122 goto error; 1123 + tb2->fastreuse = -1; 1124 + tb2->fastreuseport = -1; 1162 1125 } 1163 1126 1164 1127 /* Here we want to add a little bit of randomness to the next source ··· 1173 1132 1174 1133 /* Head lock still held and bh's disabled */ 1175 1134 inet_bind_hash(sk, tb, tb2, port); 1135 + sk->sk_userlocks |= SOCK_CONNECT_BIND; 1176 1136 1177 1137 if (sk_unhashed(sk)) { 1178 1138 inet_sk(sk)->inet_sport = htons(port);
+1
net/ipv4/inet_timewait_sock.c
··· 208 208 tw->tw_hash = sk->sk_hash; 209 209 tw->tw_ipv6only = 0; 210 210 tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); 211 + tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND); 211 212 tw->tw_prot = sk->sk_prot_creator; 212 213 atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); 213 214 twsk_net_set(tw, sock_net(sk));