tcp: add net.ipv4.tcp_comp_sack_rtt_percent

TCP SACK compression was added in 2018 by commit
5d9f4262b7ea ("tcp: add SACK compression").

It works great for WAN flows (with large RTT).
Wifi in particular gets a significant boost _when_ ACKs are suppressed.

Add a new sysctl so that we can tune the very conservative 5 % value
that has been used so far in the formula below, so that small RTT flows
can also benefit from this feature:

delay = min ( 5 % of RTT, 1 ms)

This patch adds the new tcp_comp_sack_rtt_percent sysctl
to ease experiments and tuning.

Given that we cap the delay to 1ms (tcp_comp_sack_delay_ns sysctl),
set the default value to 33 %.

Quoting Neal Cardwell ( https://lore.kernel.org/netdev/CADVnQymZ1tFnEA1Q=vtECs0=Db7zHQ8=+WCQtnhHFVbEOzjVnQ@mail.gmail.com/ )

The rationale for 33% is basically to try to facilitate pipelining,
where there are always at least 3 ACKs and 3 GSO/TSO skbs per SRTT, so
that the path can maintain a budget for 3 full-sized GSO/TSO skbs "in
flight" at all times:

+ 1 skb in the qdisc waiting to be sent by the NIC next
+ 1 skb being sent by the NIC (being serialized by the NIC out onto the wire)
+ 1 skb being received and aggregated by the receiver machine's
aggregation mechanism (some combination of LRO, GRO, and sack
compression)

Note that this is basically the same magic number (3) and the same
rationales as:

(a) tcp_tso_should_defer() ensuring that we defer sending data for no
longer than cwnd/tcp_tso_win_divisor (where tcp_tso_win_divisor = 3),
and
(b) bbr_quantization_budget() ensuring that cwnd is at least 3 GSO/TSO
skbs to maintain pipelining and full throughput at low RTTs

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Link: https://patch.msgid.link/20251106115236.3450026-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Eric Dumazet and committed by

Jakub Kicinski 7 months ago 416dd649 45cb3c6f

+40 -10

5 changed files

expand all

Documentation

networking

ip-sysctl.rst

include

net

netns

ipv4.h

net

ipv4

sysctl_net_ipv4.c

tcp_input.c

tcp_ipv4.c

+11 -2

Documentation/networking/ip-sysctl.rst

··· 854 854 855 855 Default: 1 (enabled) 856 856 857 + tcp_comp_sack_rtt_percent - INTEGER 858 + Percentage of SRTT used for the compressed SACK feature. 859 + See tcp_comp_sack_nr, tcp_comp_sack_delay_ns, tcp_comp_sack_slack_ns. 860 + 861 + Possible values : 1 - 1000 862 + 863 + Default : 33 % 864 + 857 865 tcp_comp_sack_delay_ns - LONG INTEGER 858 - TCP tries to reduce number of SACK sent, using a timer 859 - based on 5% of SRTT, capped by this sysctl, in nano seconds. 866 + TCP tries to reduce number of SACK sent, using a timer based 867 + on tcp_comp_sack_rtt_percent of SRTT, capped by this sysctl 868 + in nano seconds. 860 869 The default is 1ms, based on TSO autosizing period. 861 870 862 871 Default : 1,000,000 ns (1 ms)

include/net/netns/ipv4.h

··· 221 221 int sysctl_tcp_pacing_ss_ratio; 222 222 int sysctl_tcp_pacing_ca_ratio; 223 223 unsigned int sysctl_tcp_child_ehash_entries; 224 + int sysctl_tcp_comp_sack_rtt_percent; 224 225 unsigned long sysctl_tcp_comp_sack_delay_ns; 225 226 unsigned long sysctl_tcp_comp_sack_slack_ns; 226 227 int sysctl_max_syn_backlog;

net/ipv4/sysctl_net_ipv4.c

··· 1452 1452 .proc_handler = proc_doulongvec_minmax, 1453 1453 }, 1454 1454 { 1455 + .procname = "tcp_comp_sack_rtt_percent", 1456 + .data = &init_net.ipv4.sysctl_tcp_comp_sack_rtt_percent, 1457 + .maxlen = sizeof(int), 1458 + .mode = 0644, 1459 + .proc_handler = proc_dointvec_minmax, 1460 + .extra1 = SYSCTL_ONE, 1461 + .extra2 = SYSCTL_ONE_THOUSAND, 1462 + }, 1463 + { 1455 1464 .procname = "tcp_comp_sack_slack_ns", 1456 1465 .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns, 1457 1466 .maxlen = sizeof(unsigned long),

+18 -8

net/ipv4/tcp_input.c

··· 5893 5893 static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) 5894 5894 { 5895 5895 struct tcp_sock *tp = tcp_sk(sk); 5896 - unsigned long rtt, delay; 5896 + struct net *net = sock_net(sk); 5897 + unsigned long rtt; 5898 + u64 delay; 5897 5899 5898 5900 /* More than one full frame received... */ 5899 5901 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ··· 5914 5912 * Defer the ack until tcp_release_cb(). 5915 5913 */ 5916 5914 if (sock_owned_by_user_nocheck(sk) && 5917 - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) { 5915 + READ_ONCE(net->ipv4.sysctl_tcp_backlog_ack_defer)) { 5918 5916 set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags); 5919 5917 return; 5920 5918 } ··· 5929 5927 } 5930 5928 5931 5929 if (!tcp_is_sack(tp) || 5932 - tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) 5930 + tp->compressed_ack >= READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_nr)) 5933 5931 goto send_now; 5934 5932 5935 5933 if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { ··· 5944 5942 if (hrtimer_is_queued(&tp->compressed_ack_timer)) 5945 5943 return; 5946 5944 5947 - /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */ 5945 + /* compress ack timer : comp_sack_rtt_percent of rtt, 5946 + * but no more than tcp_comp_sack_delay_ns. 
5947 + */ 5948 5948 5949 5949 rtt = tp->rcv_rtt_est.rtt_us; 5950 5950 if (tp->srtt_us && tp->srtt_us < rtt) 5951 5951 rtt = tp->srtt_us; 5952 5952 5953 - delay = min_t(unsigned long, 5954 - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), 5955 - rtt * (NSEC_PER_USEC >> 3)/20); 5953 + /* delay = (rtt >> 3) * NSEC_PER_USEC * comp_sack_rtt_percent / 100 5954 + * -> 5955 + * delay = rtt * 1.25 * comp_sack_rtt_percent 5956 + */ 5957 + delay = (u64)(rtt + (rtt >> 2)) * 5958 + READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_rtt_percent); 5959 + 5960 + delay = min(delay, READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_delay_ns)); 5961 + 5956 5962 sock_hold(sk); 5957 5963 hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), 5958 - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns), 5964 + READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_slack_ns), 5959 5965 HRTIMER_MODE_REL_PINNED_SOFT); 5960 5966 } 5961 5967

net/ipv4/tcp_ipv4.c

··· 3595 3595 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; 3596 3596 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; 3597 3597 net->ipv4.sysctl_tcp_comp_sack_nr = 44; 3598 + net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33; 3598 3599 net->ipv4.sysctl_tcp_backlog_ack_defer = 1; 3599 3600 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; 3600 3601 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;

Configure Feed

Configure Feed