Merge branch 'tcp-fix-receive-autotune-again'

Matthieu Baerts says:

====================
tcp: fix receive autotune again

Neal Cardwell found that recent kernels were hitting RWIN-limited
issues, even when net.ipv4.tcp_rmem[2] was set to a very large value
such as 512MB.

He suspected that the tcp_stream default buffer size (64KB) was
triggering the heuristic added in commit ea33537d8292 ("tcp: add
receive queue awareness in tcp_rcv_space_adjust()").

After more testing, it turns out the bug was added earlier
with commit 65c5287892e9 ("tcp: fix sk_rcvbuf overshoot").

I forgot once again that DRS has a latency of one RTT.

MPTCP has the same issue.

This series:
- Prevents calling tcp_rcvbuf_grow() on some MPTCP subflows.
- Adds rcv_ssthresh, window_clamp and rcv_wnd to trace_tcp_rcvbuf_grow().
- Refactors code in a patch with no functional changes.
- Fixes the issue in the final patch.
====================

Link: https://patch.msgid.link/20251028-net-tcp-recv-autotune-v3-0-74b43ba4c84c@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Jakub Kicinski 7 months ago bcc843bb f99c5792

+41 -17

4 changed files

expand all

include

net

tcp.h

trace

events

tcp.h

net

ipv4

tcp_input.c

mptcp

protocol.c

+1 -1

include/net/tcp.h

··· 370 370 int tcp_ioctl(struct sock *sk, int cmd, int *karg); 371 371 enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); 372 372 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); 373 - void tcp_rcvbuf_grow(struct sock *sk); 373 + void tcp_rcvbuf_grow(struct sock *sk, u32 newval); 374 374 void tcp_rcv_space_adjust(struct sock *sk); 375 375 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); 376 376 void tcp_twsk_destructor(struct sock *sk);

include/trace/events/tcp.h

··· 218 218 __field(__u32, space) 219 219 __field(__u32, ooo_space) 220 220 __field(__u32, rcvbuf) 221 + __field(__u32, rcv_ssthresh) 222 + __field(__u32, window_clamp) 223 + __field(__u32, rcv_wnd) 221 224 __field(__u8, scaling_ratio) 222 225 __field(__u16, sport) 223 226 __field(__u16, dport) ··· 248 245 tp->rcv_nxt; 249 246 250 247 __entry->rcvbuf = sk->sk_rcvbuf; 248 + __entry->rcv_ssthresh = tp->rcv_ssthresh; 249 + __entry->window_clamp = tp->window_clamp; 250 + __entry->rcv_wnd = tp->rcv_wnd; 251 251 __entry->scaling_ratio = tp->scaling_ratio; 252 252 __entry->sport = ntohs(inet->inet_sport); 253 253 __entry->dport = ntohs(inet->inet_dport); ··· 270 264 ), 271 265 272 266 TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u " 267 + "rcv_ssthresh=%u window_clamp=%u rcv_wnd=%u " 273 268 "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 " 274 269 "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx", 275 270 __entry->time, __entry->rtt_us, __entry->copied, 276 271 __entry->inq, __entry->space, __entry->ooo_space, 277 272 __entry->scaling_ratio, __entry->rcvbuf, 273 + __entry->rcv_ssthresh, __entry->window_clamp, 274 + __entry->rcv_wnd, 278 275 show_family_name(__entry->family), 279 276 __entry->sport, __entry->dport, 280 277 __entry->saddr, __entry->daddr,

+14 -7

net/ipv4/tcp_input.c

··· 891 891 } 892 892 } 893 893 894 - void tcp_rcvbuf_grow(struct sock *sk) 894 + void tcp_rcvbuf_grow(struct sock *sk, u32 newval) 895 895 { 896 896 const struct net *net = sock_net(sk); 897 897 struct tcp_sock *tp = tcp_sk(sk); 898 - int rcvwin, rcvbuf, cap; 898 + u32 rcvwin, rcvbuf, cap, oldval; 899 + u64 grow; 900 + 901 + oldval = tp->rcvq_space.space; 902 + tp->rcvq_space.space = newval; 899 903 900 904 if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || 901 905 (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) 902 906 return; 903 907 908 + /* DRS is always one RTT late. */ 909 + rcvwin = newval << 1; 910 + 904 911 /* slow start: allow the sender to double its rate. */ 905 - rcvwin = tp->rcvq_space.space << 1; 912 + grow = (u64)rcvwin * (newval - oldval); 913 + do_div(grow, oldval); 914 + rcvwin += grow << 1; 906 915 907 916 if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) 908 917 rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; ··· 952 943 953 944 trace_tcp_rcvbuf_grow(sk, time); 954 945 955 - tp->rcvq_space.space = copied; 956 - 957 - tcp_rcvbuf_grow(sk); 946 + tcp_rcvbuf_grow(sk, copied); 958 947 959 948 new_measure: 960 949 tp->rcvq_space.seq = tp->copied_seq; ··· 5277 5270 } 5278 5271 /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ 5279 5272 if (sk->sk_socket) 5280 - tcp_rcvbuf_grow(sk); 5273 + tcp_rcvbuf_grow(sk, tp->rcvq_space.space); 5281 5274 } 5282 5275 5283 5276 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,

+17 -9

net/mptcp/protocol.c

··· 194 194 * - mptcp does not maintain a msk-level window clamp 195 195 * - returns true when the receive buffer is actually updated 196 196 */ 197 - static bool mptcp_rcvbuf_grow(struct sock *sk) 197 + static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval) 198 198 { 199 199 struct mptcp_sock *msk = mptcp_sk(sk); 200 200 const struct net *net = sock_net(sk); 201 - int rcvwin, rcvbuf, cap; 201 + u32 rcvwin, rcvbuf, cap, oldval; 202 + u64 grow; 202 203 204 + oldval = msk->rcvq_space.space; 205 + msk->rcvq_space.space = newval; 203 206 if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || 204 207 (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) 205 208 return false; 206 209 207 - rcvwin = msk->rcvq_space.space << 1; 210 + /* DRS is always one RTT late. */ 211 + rcvwin = newval << 1; 212 + 213 + /* slow start: allow the sender to double its rate. */ 214 + grow = (u64)rcvwin * (newval - oldval); 215 + do_div(grow, oldval); 216 + rcvwin += grow << 1; 208 217 209 218 if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) 210 219 rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq; ··· 343 334 skb_set_owner_r(skb, sk); 344 335 /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ 345 336 if (sk->sk_socket) 346 - mptcp_rcvbuf_grow(sk); 337 + mptcp_rcvbuf_grow(sk, msk->rcvq_space.space); 347 338 } 348 339 349 340 static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, ··· 2058 2049 if (msk->rcvq_space.copied <= msk->rcvq_space.space) 2059 2050 goto new_measure; 2060 2051 2061 - msk->rcvq_space.space = msk->rcvq_space.copied; 2062 - if (mptcp_rcvbuf_grow(sk)) { 2063 - 2052 + if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) { 2064 2053 /* Make subflows follow along. 
If we do not do this, we 2065 2054 * get drops at subflow level if skbs can't be moved to 2066 2055 * the mptcp rx queue fast enough (announced rcv_win can ··· 2070 2063 2071 2064 ssk = mptcp_subflow_tcp_sock(subflow); 2072 2065 slow = lock_sock_fast(ssk); 2073 - tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied; 2074 - tcp_rcvbuf_grow(ssk); 2066 + /* subflows can be added before tcp_init_transfer() */ 2067 + if (tcp_sk(ssk)->rcvq_space.space) 2068 + tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied); 2075 2069 unlock_sock_fast(ssk, slow); 2076 2070 } 2077 2071 }

Configure Feed

Configure Feed