Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'tcp-receive-side-improvements'

Eric Dumazet says:

====================
tcp: receive side improvements

We have set tcp_rmem[2] to 15 MB for about 8 years at Google,
but had some issues with high-speed flows on very small RTTs.

TCP rx autotuning has a tendency to overestimate the RTT,
and thus tp->rcvq_space.space and sk->sk_rcvbuf as well.

This makes TCP receive queues much bigger than necessary,
to the point that CPU caches are evicted before the application can
copy the data, on CPUs using DDIO.

This series aims to fix this.

- The first patch adds the tcp_rcvbuf_grow() tracepoint, which was very
convenient for studying the various issues fixed in this series.

- Seven patches fix receiver autotune issues.

- Two patches fix sender side issues.

- Final patch increases tcp_rmem[2] so that TCP speed over WAN
can meet modern needs.

Tested on a 200Gbit NIC, average max throughput of a single flow:

Before:
73593 Mbit.

After:
122514 Mbit.
====================

Link: https://patch.msgid.link/20250513193919.1089692-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+134 -66
+2 -2
Documentation/networking/ip-sysctl.rst
··· 735 735 net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables 736 736 automatic tuning of that socket's receive buffer size, in which 737 737 case this value is ignored. 738 - Default: between 131072 and 6MB, depending on RAM size. 738 + Default: between 131072 and 32MB, depending on RAM size. 739 739 740 740 tcp_sack - BOOLEAN 741 741 Enable select acknowledgments (SACKS). ··· 1099 1099 limits the number of bytes on qdisc or device to reduce artificial 1100 1100 RTT/cwnd and reduce bufferbloat. 1101 1101 1102 - Default: 1048576 (16 * 65536) 1102 + Default: 4194304 (4 MB) 1103 1103 1104 1104 tcp_challenge_ack_limit - INTEGER 1105 1105 Limits number of Challenge ACK sent per second, as recommended
+1 -1
include/linux/tcp.h
··· 340 340 } rcv_rtt_est; 341 341 /* Receiver queue space */ 342 342 struct { 343 - u32 space; 343 + int space; 344 344 u32 seq; 345 345 u64 time; 346 346 } rcvq_space;
+73
include/trace/events/tcp.h
··· 213 213 TP_ARGS(sk) 214 214 ); 215 215 216 + TRACE_EVENT(tcp_rcvbuf_grow, 217 + 218 + TP_PROTO(struct sock *sk, int time), 219 + 220 + TP_ARGS(sk, time), 221 + 222 + TP_STRUCT__entry( 223 + __field(int, time) 224 + __field(__u32, rtt_us) 225 + __field(__u32, copied) 226 + __field(__u32, inq) 227 + __field(__u32, space) 228 + __field(__u32, ooo_space) 229 + __field(__u32, rcvbuf) 230 + __field(__u8, scaling_ratio) 231 + __field(__u16, sport) 232 + __field(__u16, dport) 233 + __field(__u16, family) 234 + __array(__u8, saddr, 4) 235 + __array(__u8, daddr, 4) 236 + __array(__u8, saddr_v6, 16) 237 + __array(__u8, daddr_v6, 16) 238 + __field(const void *, skaddr) 239 + __field(__u64, sock_cookie) 240 + ), 241 + 242 + TP_fast_assign( 243 + struct inet_sock *inet = inet_sk(sk); 244 + struct tcp_sock *tp = tcp_sk(sk); 245 + __be32 *p32; 246 + 247 + __entry->time = time; 248 + __entry->rtt_us = tp->rcv_rtt_est.rtt_us >> 3; 249 + __entry->copied = tp->copied_seq - tp->rcvq_space.seq; 250 + __entry->inq = tp->rcv_nxt - tp->copied_seq; 251 + __entry->space = tp->rcvq_space.space; 252 + __entry->ooo_space = RB_EMPTY_ROOT(&tp->out_of_order_queue) ? 
0 : 253 + TCP_SKB_CB(tp->ooo_last_skb)->end_seq - 254 + tp->rcv_nxt; 255 + 256 + __entry->rcvbuf = sk->sk_rcvbuf; 257 + __entry->scaling_ratio = tp->scaling_ratio; 258 + __entry->sport = ntohs(inet->inet_sport); 259 + __entry->dport = ntohs(inet->inet_dport); 260 + __entry->family = sk->sk_family; 261 + 262 + p32 = (__be32 *) __entry->saddr; 263 + *p32 = inet->inet_saddr; 264 + 265 + p32 = (__be32 *) __entry->daddr; 266 + *p32 = inet->inet_daddr; 267 + 268 + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, 269 + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); 270 + 271 + __entry->skaddr = sk; 272 + __entry->sock_cookie = sock_gen_cookie(sk); 273 + ), 274 + 275 + TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u " 276 + "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 " 277 + "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx", 278 + __entry->time, __entry->rtt_us, __entry->copied, 279 + __entry->inq, __entry->space, __entry->ooo_space, 280 + __entry->scaling_ratio, __entry->rcvbuf, 281 + show_family_name(__entry->family), 282 + __entry->sport, __entry->dport, 283 + __entry->saddr, __entry->daddr, 284 + __entry->saddr_v6, __entry->daddr_v6, 285 + __entry->skaddr, 286 + __entry->sock_cookie) 287 + ); 288 + 216 289 TRACE_EVENT(tcp_retransmit_synack, 217 290 218 291 TP_PROTO(const struct sock *sk, const struct request_sock *req),
+1 -1
net/ipv4/tcp.c
··· 5231 5231 /* Set per-socket limits to no more than 1/128 the pressure threshold */ 5232 5232 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); 5233 5233 max_wshare = min(4UL*1024*1024, limit); 5234 - max_rshare = min(6UL*1024*1024, limit); 5234 + max_rshare = min(32UL*1024*1024, limit); 5235 5235 5236 5236 init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; 5237 5237 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
+53 -57
net/ipv4/tcp_input.c
··· 664 664 */ 665 665 static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) 666 666 { 667 - u32 new_sample = tp->rcv_rtt_est.rtt_us; 668 - long m = sample; 667 + u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us; 668 + long m = sample << 3; 669 669 670 - if (new_sample != 0) { 670 + if (old_sample == 0 || m < old_sample) { 671 + new_sample = m; 672 + } else { 671 673 /* If we sample in larger samples in the non-timestamp 672 674 * case, we could grossly overestimate the RTT especially 673 675 * with chatty applications or bulk transfer apps which ··· 680 678 * else with timestamps disabled convergence takes too 681 679 * long. 682 680 */ 683 - if (!win_dep) { 684 - m -= (new_sample >> 3); 685 - new_sample += m; 686 - } else { 687 - m <<= 3; 688 - if (m < new_sample) 689 - new_sample = m; 690 - } 691 - } else { 692 - /* No previous measure. */ 693 - new_sample = m << 3; 681 + if (win_dep) 682 + return; 683 + /* Do not use this sample if receive queue is not empty. 
*/ 684 + if (tp->rcv_nxt != tp->copied_seq) 685 + return; 686 + new_sample = old_sample - (old_sample >> 3) + sample; 694 687 } 695 688 696 689 tp->rcv_rtt_est.rtt_us = new_sample; ··· 709 712 tp->rcv_rtt_est.time = tp->tcp_mstamp; 710 713 } 711 714 712 - static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) 715 + static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta) 713 716 { 714 717 u32 delta, delta_us; 715 718 ··· 719 722 720 723 if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { 721 724 if (!delta) 722 - delta = 1; 725 + delta = min_delta; 723 726 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); 724 727 return delta_us; 725 728 } ··· 737 740 738 741 if (TCP_SKB_CB(skb)->end_seq - 739 742 TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { 740 - s32 delta = tcp_rtt_tsopt_us(tp); 743 + s32 delta = tcp_rtt_tsopt_us(tp, 0); 741 744 742 - if (delta >= 0) 745 + if (delta > 0) 743 746 tcp_rcv_rtt_update(tp, delta, 0); 744 747 } 745 748 } 746 749 750 + static void tcp_rcvbuf_grow(struct sock *sk) 751 + { 752 + const struct net *net = sock_net(sk); 753 + struct tcp_sock *tp = tcp_sk(sk); 754 + int rcvwin, rcvbuf, cap; 755 + 756 + if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || 757 + (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) 758 + return; 759 + 760 + /* slow start: allow the sender to double its rate. */ 761 + rcvwin = tp->rcvq_space.space << 1; 762 + 763 + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) 764 + rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; 765 + 766 + cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); 767 + 768 + rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap); 769 + if (rcvbuf > sk->sk_rcvbuf) { 770 + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); 771 + /* Make the window clamp follow along. */ 772 + WRITE_ONCE(tp->window_clamp, 773 + tcp_win_from_space(sk, rcvbuf)); 774 + } 775 + } 747 776 /* 748 777 * This function should be called every time data is copied to user space. 
749 778 * It calculates the appropriate TCP receive buffer space. ··· 777 754 void tcp_rcv_space_adjust(struct sock *sk) 778 755 { 779 756 struct tcp_sock *tp = tcp_sk(sk); 780 - u32 copied; 781 - int time; 757 + int time, inq, copied; 782 758 783 759 trace_tcp_rcv_space_adjust(sk); 784 760 ··· 788 766 789 767 /* Number of bytes copied to user in last RTT */ 790 768 copied = tp->copied_seq - tp->rcvq_space.seq; 769 + /* Number of bytes in receive queue. */ 770 + inq = tp->rcv_nxt - tp->copied_seq; 771 + copied -= inq; 791 772 if (copied <= tp->rcvq_space.space) 792 773 goto new_measure; 793 774 794 - /* A bit of theory : 795 - * copied = bytes received in previous RTT, our base window 796 - * To cope with packet losses, we need a 2x factor 797 - * To cope with slow start, and sender growing its cwin by 100 % 798 - * every RTT, we need a 4x factor, because the ACK we are sending 799 - * now is for the next RTT, not the current one : 800 - * <prev RTT . ><current RTT .. ><next RTT .... > 801 - */ 775 + trace_tcp_rcvbuf_grow(sk, time); 802 776 803 - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && 804 - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 805 - u64 rcvwin, grow; 806 - int rcvbuf; 807 - 808 - /* minimal window to cope with packet losses, assuming 809 - * steady state. Add some cushion because of small variations. 810 - */ 811 - rcvwin = ((u64)copied << 1) + 16 * tp->advmss; 812 - 813 - /* Accommodate for sender rate increase (eg. slow start) */ 814 - grow = rcvwin * (copied - tp->rcvq_space.space); 815 - do_div(grow, tp->rcvq_space.space); 816 - rcvwin += (grow << 1); 817 - 818 - rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), 819 - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); 820 - if (rcvbuf > sk->sk_rcvbuf) { 821 - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); 822 - 823 - /* Make the window clamp follow along. 
*/ 824 - WRITE_ONCE(tp->window_clamp, 825 - tcp_win_from_space(sk, rcvbuf)); 826 - } 827 - } 828 777 tp->rcvq_space.space = copied; 778 + 779 + tcp_rcvbuf_grow(sk); 829 780 830 781 new_measure: 831 782 tp->rcvq_space.seq = tp->copied_seq; ··· 3221 3226 */ 3222 3227 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && 3223 3228 tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) 3224 - seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp); 3229 + seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1); 3225 3230 3226 3231 rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ 3227 3232 if (seq_rtt_us < 0) ··· 5168 5173 skb_condense(skb); 5169 5174 skb_set_owner_r(skb, sk); 5170 5175 } 5176 + tcp_rcvbuf_grow(sk); 5171 5177 } 5172 5178 5173 5179 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, ··· 6869 6873 if (!tp->srtt_us) 6870 6874 tcp_synack_rtt_meas(sk, req); 6871 6875 6876 + if (tp->rx_opt.tstamp_ok) 6877 + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 6878 + 6872 6879 if (req) { 6873 6880 tcp_rcv_synrecv_state_fastopen(sk); 6874 6881 } else { ··· 6896 6897 tp->snd_una = TCP_SKB_CB(skb)->ack_seq; 6897 6898 tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; 6898 6899 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 6899 - 6900 - if (tp->rx_opt.tstamp_ok) 6901 - tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 6902 6900 6903 6901 if (!inet_csk(sk)->icsk_ca_ops->cong_control) 6904 6902 tcp_update_pacing_rate(sk);
+2 -2
net/ipv4/tcp_ipv4.c
··· 3495 3495 * which are too large can cause TCP streams to be bursty. 3496 3496 */ 3497 3497 net->ipv4.sysctl_tcp_tso_win_divisor = 3; 3498 - /* Default TSQ limit of 16 TSO segments */ 3499 - net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; 3498 + /* Default TSQ limit of 4 MB */ 3499 + net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; 3500 3500 3501 3501 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ 3502 3502 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
+2 -3
net/ipv4/tcp_output.c
··· 2619 2619 limit = max_t(unsigned long, 2620 2620 2 * skb->truesize, 2621 2621 READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift)); 2622 - if (sk->sk_pacing_status == SK_PACING_NONE) 2623 - limit = min_t(unsigned long, limit, 2624 - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); 2622 + limit = min_t(unsigned long, limit, 2623 + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); 2625 2624 limit <<= factor; 2626 2625 2627 2626 if (static_branch_unlikely(&tcp_tx_delay_enabled) &&