Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'mptcp-autotune-related-improvement'

Matthieu Baerts says:

====================
mptcp: autotune related improvement

Here are two patches from Paolo that were crafted a couple of months
ago, but needed more validation because they were indirectly causing
instabilities in the selftests. The root cause has been fixed in 'net'
recently in commit 8c09412e584d ("selftests: mptcp: more stable
simult_flows tests").

These patches refactor the receive space and RTT estimator, overall
making DRS more correct while avoiding receive buffer drifting to
tcp_rmem[2], which in turn makes the throughput more stable and less
bursty, especially in high-bandwidth, low-delay environments.

Note that the first patch addresses a very old issue. 'net-next' is
targeted because the change is quite invasive and based on a recent
backlog refactor. The 'Fixes' tag is therefore there more as an FYI,
because backporting this patch will quickly be blocked by large conflicts.
====================

Link: https://patch.msgid.link/20260407-net-next-mptcp-reduce-rbuf-v2-0-0d1d135bf6f6@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+77 -33
+1 -1
include/trace/events/mptcp.h
··· 219 219 __be32 *p32; 220 220 221 221 __entry->time = time; 222 - __entry->rtt_us = msk->rcvq_space.rtt_us >> 3; 222 + __entry->rtt_us = mptcp_rtt_us_est(msk) >> 3; 223 223 __entry->copied = msk->rcvq_space.copied; 224 224 __entry->inq = mptcp_inq_hint(sk); 225 225 __entry->space = msk->rcvq_space.space;
+40 -31
net/mptcp/protocol.c
··· 879 879 return moved; 880 880 } 881 881 882 + static void mptcp_rcv_rtt_update(struct mptcp_sock *msk, 883 + struct mptcp_subflow_context *subflow) 884 + { 885 + const struct tcp_sock *tp = tcp_sk(subflow->tcp_sock); 886 + u32 rtt_us = tp->rcv_rtt_est.rtt_us; 887 + int id; 888 + 889 + /* Update once per subflow per rcvwnd to avoid touching the msk 890 + * too often. 891 + */ 892 + if (!rtt_us || tp->rcv_rtt_est.seq == subflow->prev_rtt_seq) 893 + return; 894 + 895 + subflow->prev_rtt_seq = tp->rcv_rtt_est.seq; 896 + 897 + /* Pairs with READ_ONCE() in mptcp_rtt_us_est(). */ 898 + id = msk->rcv_rtt_est.next_sample; 899 + WRITE_ONCE(msk->rcv_rtt_est.samples[id], rtt_us); 900 + if (++msk->rcv_rtt_est.next_sample == MPTCP_RTT_SAMPLES) 901 + msk->rcv_rtt_est.next_sample = 0; 902 + 903 + /* EWMA among the incoming subflows */ 904 + msk->scaling_ratio = ((msk->scaling_ratio << 3) - msk->scaling_ratio + 905 + tp->scaling_ratio) >> 3; 906 + } 907 + 882 908 void mptcp_data_ready(struct sock *sk, struct sock *ssk) 883 909 { 884 910 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); ··· 918 892 return; 919 893 920 894 mptcp_data_lock(sk); 895 + mptcp_rcv_rtt_update(msk, subflow); 921 896 if (!sock_owned_by_user(sk)) { 922 897 /* Wake-up the reader only for in-sequence data */ 923 898 if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) ··· 2122 2095 2123 2096 msk->rcvspace_init = 1; 2124 2097 msk->rcvq_space.copied = 0; 2125 - msk->rcvq_space.rtt_us = 0; 2126 2098 2127 2099 /* initial rcv_space offering made to peer */ 2128 2100 msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, ··· 2132 2106 2133 2107 /* receive buffer autotuning. See tcp_rcv_space_adjust for more information. 2134 2108 * 2135 - * Only difference: Use highest rtt estimate of the subflows in use. 2109 + * Only difference: Use lowest rtt estimate of the subflows in use, see 2110 + * mptcp_rcv_rtt_update() and mptcp_rtt_us_est(). 
2136 2111 */ 2137 2112 static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) 2138 2113 { 2139 2114 struct mptcp_subflow_context *subflow; 2140 2115 struct sock *sk = (struct sock *)msk; 2141 - u8 scaling_ratio = U8_MAX; 2142 - u32 time, advmss = 1; 2143 - u64 rtt_us, mstamp; 2116 + u32 time, rtt_us; 2117 + u64 mstamp; 2144 2118 2145 2119 msk_owned_by_me(msk); 2146 2120 ··· 2155 2129 mstamp = mptcp_stamp(); 2156 2130 time = tcp_stamp_us_delta(mstamp, READ_ONCE(msk->rcvq_space.time)); 2157 2131 2158 - rtt_us = msk->rcvq_space.rtt_us; 2159 - if (rtt_us && time < (rtt_us >> 3)) 2132 + rtt_us = mptcp_rtt_us_est(msk); 2133 + if (rtt_us == U32_MAX || time < (rtt_us >> 3)) 2160 2134 return; 2161 2135 2162 - rtt_us = 0; 2163 - mptcp_for_each_subflow(msk, subflow) { 2164 - const struct tcp_sock *tp; 2165 - u64 sf_rtt_us; 2166 - u32 sf_advmss; 2167 - 2168 - tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); 2169 - 2170 - sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); 2171 - sf_advmss = READ_ONCE(tp->advmss); 2172 - 2173 - rtt_us = max(sf_rtt_us, rtt_us); 2174 - advmss = max(sf_advmss, advmss); 2175 - scaling_ratio = min(tp->scaling_ratio, scaling_ratio); 2176 - } 2177 - 2178 - msk->rcvq_space.rtt_us = rtt_us; 2179 - msk->scaling_ratio = scaling_ratio; 2180 - if (time < (rtt_us >> 3) || rtt_us == 0) 2181 - return; 2182 - 2183 - if (msk->rcvq_space.copied <= msk->rcvq_space.space) 2136 + copied = msk->rcvq_space.copied; 2137 + copied -= mptcp_inq_hint(sk); 2138 + if (copied <= msk->rcvq_space.space) 2184 2139 goto new_measure; 2185 2140 2186 2141 trace_mptcp_rcvbuf_grow(sk, time); 2187 - if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) { 2142 + if (mptcp_rcvbuf_grow(sk, copied)) { 2188 2143 /* Make subflows follow along. 
If we do not do this, we 2189 2144 * get drops at subflow level if skbs can't be moved to 2190 2145 * the mptcp rx queue fast enough (announced rcv_win can ··· 2179 2172 slow = lock_sock_fast(ssk); 2180 2173 /* subflows can be added before tcp_init_transfer() */ 2181 2174 if (tcp_sk(ssk)->rcvq_space.space) 2182 - tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied); 2175 + tcp_rcvbuf_grow(ssk, copied); 2183 2176 unlock_sock_fast(ssk, slow); 2184 2177 } 2185 2178 } ··· 3022 3015 msk->timer_ival = TCP_RTO_MIN; 3023 3016 msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; 3024 3017 msk->backlog_len = 0; 3018 + mptcp_init_rtt_est(msk); 3025 3019 3026 3020 WRITE_ONCE(msk->first, NULL); 3027 3021 inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; ··· 3468 3460 msk->bytes_retrans = 0; 3469 3461 msk->rcvspace_init = 0; 3470 3462 msk->fastclosing = 0; 3463 + mptcp_init_rtt_est(msk); 3471 3464 3472 3465 /* for fallback's sake */ 3473 3466 WRITE_ONCE(msk->ack_seq, 0);
+36 -1
net/mptcp/protocol.h
··· 269 269 struct page *page; 270 270 }; 271 271 272 + /* Arbitrary compromise between as low as possible to react timely to subflow 273 + * close event and as big as possible to avoid being fouled by biased large 274 + * samples due to peer sending data on a different subflow WRT to the incoming 275 + * ack. 276 + */ 277 + #define MPTCP_RTT_SAMPLES 5 278 + 272 279 /* MPTCP connection sock */ 273 280 struct mptcp_sock { 274 281 /* inet_connection_sock must be the first member */ ··· 348 341 */ 349 342 struct mptcp_pm_data pm; 350 343 struct mptcp_sched_ops *sched; 344 + 345 + /* Most recent rtt_us observed by in use incoming subflows. */ 346 + struct { 347 + u32 samples[MPTCP_RTT_SAMPLES]; 348 + u32 next_sample; 349 + } rcv_rtt_est; 350 + 351 351 struct { 352 352 int space; /* bytes copied in last measurement window */ 353 353 int copied; /* bytes copied in this measurement window */ 354 354 u64 time; /* start time of measurement window */ 355 - u64 rtt_us; /* last maximum rtt of subflows */ 356 355 } rcvq_space; 357 356 u8 scaling_ratio; 358 357 bool allow_subflows; ··· 434 421 const struct mptcp_sock *msk = mptcp_sk(sk); 435 422 436 423 return msk->first_pending; 424 + } 425 + 426 + static inline void mptcp_init_rtt_est(struct mptcp_sock *msk) 427 + { 428 + int i; 429 + 430 + for (i = 0; i < MPTCP_RTT_SAMPLES; ++i) 431 + msk->rcv_rtt_est.samples[i] = U32_MAX; 432 + msk->rcv_rtt_est.next_sample = 0; 433 + msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; 434 + } 435 + 436 + static inline u32 mptcp_rtt_us_est(const struct mptcp_sock *msk) 437 + { 438 + u32 rtt_us = READ_ONCE(msk->rcv_rtt_est.samples[0]); 439 + int i; 440 + 441 + /* Lockless access of collected samples. 
*/ 442 + for (i = 1; i < MPTCP_RTT_SAMPLES; ++i) 443 + rtt_us = min(rtt_us, READ_ONCE(msk->rcv_rtt_est.samples[i])); 444 + return rtt_us; 437 445 } 438 446 439 447 static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) ··· 558 524 u32 map_data_len; 559 525 __wsum map_data_csum; 560 526 u32 map_csum_len; 527 + u32 prev_rtt_seq; 561 528 u32 request_mptcp : 1, /* send MP_CAPABLE */ 562 529 request_join : 1, /* send MP_JOIN */ 563 530 request_bkup : 1,