Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'tcp-tcp_rcvbuf_grow-changes'

Eric Dumazet says:

====================
tcp: tcp_rcvbuf_grow() changes

First patch is minor and moves tcp_moderate_rcvbuf into the appropriate group.

Second patch is another attempt to keep sk->sk_rcvbuf small for DC
(small RTT) TCP flows for optimal performance.
====================

Link: https://patch.msgid.link/20251119084813.3684576-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+42 -13
+10
Documentation/networking/ip-sysctl.rst
··· 673 673 674 674 Default: 1 (enabled) 675 675 676 + tcp_rcvbuf_low_rtt - INTEGER 677 + rcvbuf autotuning can over estimate final socket rcvbuf, which 678 + can lead to cache trashing for high throughput flows. 679 + 680 + For small RTT flows (below tcp_rcvbuf_low_rtt usecs), we can relax 681 + rcvbuf growth: Few additional ms to reach the final (and smaller) 682 + rcvbuf is a good tradeoff. 683 + 684 + Default : 1000 (1 ms) 685 + 676 686 tcp_mtu_probing - INTEGER 677 687 Controls TCP Packetization-Layer Path MTU Discovery. Takes three 678 688 values:
+2 -1
Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst
··· 102 102 u8 sysctl_tcp_frto tcp_enter_loss 103 103 u8 sysctl_tcp_nometrics_save TCP_LAST_ACK/tcp_update_metrics 104 104 u8 sysctl_tcp_no_ssthresh_metrics_save TCP_LAST_ACK/tcp_(update/init)_metrics 105 - u8 sysctl_tcp_moderate_rcvbuf read_mostly read_mostly tcp_tso_should_defer(tx);tcp_rcv_space_adjust(rx) 105 + u8 sysctl_tcp_moderate_rcvbuf read_mostly tcp_rcvbuf_grow() 106 + u32 sysctl_tcp_rcvbuf_low_rtt read_mostly tcp_rcvbuf_grow() 106 107 u8 sysctl_tcp_tso_win_divisor read_mostly tcp_tso_should_defer(tcp_write_xmit) 107 108 u8 sysctl_tcp_workaround_signed_windows tcp_select_window 108 109 int sysctl_tcp_limit_output_bytes read_mostly tcp_small_queue_check(tcp_write_xmit)
+2 -1
include/net/netns/ipv4.h
··· 74 74 75 75 /* TXRX readonly hotpath cache lines */ 76 76 __cacheline_group_begin(netns_ipv4_read_txrx); 77 - u8 sysctl_tcp_moderate_rcvbuf; 78 77 __cacheline_group_end(netns_ipv4_read_txrx); 79 78 80 79 /* RX readonly hotpath cache line */ 81 80 __cacheline_group_begin(netns_ipv4_read_rx); 81 + u8 sysctl_tcp_moderate_rcvbuf; 82 82 u8 sysctl_ip_early_demux; 83 83 u8 sysctl_tcp_early_demux; 84 84 u8 sysctl_tcp_l3mdev_accept; 85 85 /* 3 bytes hole, try to pack */ 86 86 int sysctl_tcp_reordering; 87 87 int sysctl_tcp_rmem[3]; 88 + int sysctl_tcp_rcvbuf_low_rtt; 88 89 __cacheline_group_end(netns_ipv4_read_rx); 89 90 90 91 struct inet_timewait_death_row tcp_death_row;
+4 -7
net/core/net_namespace.c
··· 1223 1223 sysctl_tcp_wmem); 1224 1224 CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, 1225 1225 sysctl_ip_fwd_use_pmtu); 1226 - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33); 1227 - 1228 - /* TXRX readonly hotpath cache lines */ 1229 - CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx, 1230 - sysctl_tcp_moderate_rcvbuf); 1231 - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1); 1232 1226 1233 1227 /* RX readonly hotpath cache line */ 1228 + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, 1229 + sysctl_tcp_moderate_rcvbuf); 1230 + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, 1231 + sysctl_tcp_rcvbuf_low_rtt); 1234 1232 CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, 1235 1233 sysctl_ip_early_demux); 1236 1234 CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, ··· 1239 1241 sysctl_tcp_reordering); 1240 1242 CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, 1241 1243 sysctl_tcp_rmem); 1242 - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22); 1243 1244 } 1244 1245 #endif 1245 1246
+9
net/ipv4/sysctl_net_ipv4.c
··· 1343 1343 .proc_handler = proc_dou8vec_minmax, 1344 1344 }, 1345 1345 { 1346 + .procname = "tcp_rcvbuf_low_rtt", 1347 + .data = &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt, 1348 + .maxlen = sizeof(int), 1349 + .mode = 0644, 1350 + .proc_handler = proc_dointvec_minmax, 1351 + .extra1 = SYSCTL_ZERO, 1352 + .extra2 = SYSCTL_INT_MAX, 1353 + }, 1354 + { 1346 1355 .procname = "tcp_tso_win_divisor", 1347 1356 .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, 1348 1357 .maxlen = sizeof(u8),
+14 -4
net/ipv4/tcp_input.c
··· 896 896 const struct net *net = sock_net(sk); 897 897 struct tcp_sock *tp = tcp_sk(sk); 898 898 u32 rcvwin, rcvbuf, cap, oldval; 899 + u32 rtt_threshold, rtt_us; 899 900 u64 grow; 900 901 901 902 oldval = tp->rcvq_space.space; ··· 909 908 /* DRS is always one RTT late. */ 910 909 rcvwin = newval << 1; 911 910 912 - /* slow start: allow the sender to double its rate. */ 913 - grow = (u64)rcvwin * (newval - oldval); 914 - do_div(grow, oldval); 915 - rcvwin += grow << 1; 911 + rtt_us = tp->rcv_rtt_est.rtt_us >> 3; 912 + rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt); 913 + if (rtt_us < rtt_threshold) { 914 + /* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold. 915 + * It might take few additional ms to reach 'line rate', 916 + * but will avoid sk_rcvbuf inflation and poor cache use. 917 + */ 918 + grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold); 919 + } else { 920 + /* slow start: allow the sender to double its rate. */ 921 + grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval); 922 + } 923 + rcvwin += grow; 916 924 917 925 if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) 918 926 rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
+1
net/ipv4/tcp_ipv4.c
··· 3566 3566 net->ipv4.sysctl_tcp_adv_win_scale = 1; 3567 3567 net->ipv4.sysctl_tcp_frto = 2; 3568 3568 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; 3569 + net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC; 3569 3570 /* This limits the percentage of the congestion window which we 3570 3571 * will allow a single TSO frame to consume. Building TSO frames 3571 3572 * which are too large can cause TCP streams to be bursty.