Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'mptcp-receive-path-improvement'

Matthieu Baerts says:

====================
mptcp: receive path improvement

This series includes several changes to the MPTCP RX path. The main
goals are improving the RX performance and increasing the long-term
maintainability.

Some changes reflect recent(ish) improvements introduced in the TCP
stack: patches 1, 2 and 3 are the MPTCP counterparts of SKB deferral free
and auto-tuning improvements. Note that patch 3 could possibly fix
additional issues and, overall, such a patch should prevent similar
issues from arising in the future.

Patches 4-7 prepare for the socket backlog usage, which will be
introduced in a later series, to process the packets received by the
different subflows while the msk socket is owned.

Patch 8 is not related to the RX path, but it contains additional tests
for new features recently introduced in net-next.
====================

Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-0-5da266aa9c1a@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+177 -95
+1
include/net/tcp.h
··· 370 370 int tcp_ioctl(struct sock *sk, int cmd, int *karg); 371 371 enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); 372 372 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); 373 + void tcp_rcvbuf_grow(struct sock *sk); 373 374 void tcp_rcv_space_adjust(struct sock *sk); 374 375 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); 375 376 void tcp_twsk_destructor(struct sock *sk);
+1 -1
net/ipv4/tcp_input.c
··· 891 891 } 892 892 } 893 893 894 - static void tcp_rcvbuf_grow(struct sock *sk) 894 + void tcp_rcvbuf_grow(struct sock *sk) 895 895 { 896 896 const struct net *net = sock_net(sk); 897 897 struct tcp_sock *tp = tcp_sk(sk);
+95 -92
net/mptcp/protocol.c
··· 142 142 __kfree_skb(skb); 143 143 } 144 144 145 - static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, 146 - struct sk_buff *from) 145 + static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, 146 + struct sk_buff *from, bool *fragstolen, 147 + int *delta) 147 148 { 148 - bool fragstolen; 149 - int delta; 149 + int limit = READ_ONCE(sk->sk_rcvbuf); 150 150 151 151 if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) || 152 152 MPTCP_SKB_CB(from)->offset || 153 - ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || 154 - !skb_try_coalesce(to, from, &fragstolen, &delta)) 153 + ((to->len + from->len) > (limit >> 3)) || 154 + !skb_try_coalesce(to, from, fragstolen, delta)) 155 155 return false; 156 156 157 157 pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n", 158 158 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, 159 159 to->len, MPTCP_SKB_CB(from)->end_seq); 160 160 MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; 161 + return true; 162 + } 163 + 164 + static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, 165 + struct sk_buff *from) 166 + { 167 + bool fragstolen; 168 + int delta; 169 + 170 + if (!__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta)) 171 + return false; 161 172 162 173 /* note the fwd memory can reach a negative value after accounting 163 174 * for the delta, but the later skb free will restore a non ··· 188 177 return false; 189 178 190 179 return mptcp_try_coalesce((struct sock *)msk, to, from); 180 + } 181 + 182 + /* "inspired" by tcp_rcvbuf_grow(), main difference: 183 + * - mptcp does not maintain a msk-level window clamp 184 + * - returns true when the receive buffer is actually updated 185 + */ 186 + static bool mptcp_rcvbuf_grow(struct sock *sk) 187 + { 188 + struct mptcp_sock *msk = mptcp_sk(sk); 189 + const struct net *net = sock_net(sk); 190 + int rcvwin, rcvbuf, cap; 191 + 192 + if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || 193 + (sk->sk_userlocks & 
SOCK_RCVBUF_LOCK)) 194 + return false; 195 + 196 + rcvwin = msk->rcvq_space.space << 1; 197 + 198 + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) 199 + rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq; 200 + 201 + cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); 202 + 203 + rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap); 204 + if (rcvbuf > sk->sk_rcvbuf) { 205 + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); 206 + return true; 207 + } 208 + return false; 191 209 } 192 210 193 211 /* "inspired" by tcp_data_queue_ofo(), main differences: ··· 332 292 end: 333 293 skb_condense(skb); 334 294 skb_set_owner_r(skb, sk); 295 + /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ 296 + if (sk->sk_socket) 297 + mptcp_rcvbuf_grow(sk); 335 298 } 336 299 337 - static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, 338 - struct sk_buff *skb, unsigned int offset, 339 - size_t copy_len) 300 + static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, 301 + int copy_len) 340 302 { 341 - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 342 - struct sock *sk = (struct sock *)msk; 343 - struct sk_buff *tail; 344 - bool has_rxtstamp; 345 - 346 - __skb_unlink(skb, &ssk->sk_receive_queue); 347 - 348 - skb_ext_reset(skb); 349 - skb_orphan(skb); 350 - 351 - /* try to fetch required memory from subflow */ 352 - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 353 - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); 354 - goto drop; 355 - } 356 - 357 - has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; 303 + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 304 + bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; 358 305 359 306 /* the skb map_seq accounts for the skb offset: 360 307 * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq ··· 352 325 MPTCP_SKB_CB(skb)->offset = offset; 353 326 MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; 354 327 MPTCP_SKB_CB(skb)->cant_coalesce = 
0; 328 + 329 + __skb_unlink(skb, &ssk->sk_receive_queue); 330 + 331 + skb_ext_reset(skb); 332 + skb_dst_drop(skb); 333 + } 334 + 335 + static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb) 336 + { 337 + u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq; 338 + struct mptcp_sock *msk = mptcp_sk(sk); 339 + struct sk_buff *tail; 340 + 341 + /* try to fetch required memory from subflow */ 342 + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 343 + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); 344 + goto drop; 345 + } 355 346 356 347 if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { 357 348 /* in sequence */ ··· 691 646 if (offset < skb->len) { 692 647 size_t len = skb->len - offset; 693 648 694 - ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret; 649 + mptcp_init_skb(ssk, skb, offset, len); 650 + skb_orphan(skb); 651 + ret = __mptcp_move_skb(sk, skb) || ret; 695 652 seq += len; 696 653 697 654 if (unlikely(map_remaining < len)) { ··· 814 767 815 768 moved = __mptcp_move_skbs_from_subflow(msk, ssk); 816 769 __mptcp_ofo_queue(msk); 817 - if (unlikely(ssk->sk_err)) { 818 - if (!sock_owned_by_user(sk)) 819 - __mptcp_error_report(sk); 820 - else 821 - __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags); 822 - } 770 + if (unlikely(ssk->sk_err)) 771 + __mptcp_subflow_error_report(sk, ssk); 823 772 824 773 /* If the moves have caught up with the DATA_FIN sequence number 825 774 * it's time to ack the DATA_FIN and change socket state, but ··· 827 784 return moved; 828 785 } 829 786 830 - static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) 831 - { 832 - if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf)) 833 - WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf); 834 - } 835 - 836 787 static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) 837 788 { 838 789 struct mptcp_sock *msk = mptcp_sk(sk); 839 - 840 - __mptcp_rcvbuf_update(sk, ssk); 841 790 842 791 /* Wake-up the reader only for in-sequence data */ 843 792 if 
(move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) ··· 1978 1943 } 1979 1944 1980 1945 if (!(flags & MSG_PEEK)) { 1981 - /* avoid the indirect call, we know the destructor is sock_wfree */ 1946 + /* avoid the indirect call, we know the destructor is sock_rfree */ 1982 1947 skb->destructor = NULL; 1948 + skb->sk = NULL; 1983 1949 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 1984 1950 sk_mem_uncharge(sk, skb->truesize); 1985 1951 __skb_unlink(skb, &sk->sk_receive_queue); 1986 - __kfree_skb(skb); 1952 + skb_attempt_defer_free(skb); 1987 1953 msk->bytes_consumed += count; 1988 1954 } 1989 1955 ··· 2049 2013 if (msk->rcvq_space.copied <= msk->rcvq_space.space) 2050 2014 goto new_measure; 2051 2015 2052 - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && 2053 - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 2054 - u64 rcvwin, grow; 2055 - int rcvbuf; 2016 + msk->rcvq_space.space = msk->rcvq_space.copied; 2017 + if (mptcp_rcvbuf_grow(sk)) { 2056 2018 2057 - rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; 2019 + /* Make subflows follow along. If we do not do this, we 2020 + * get drops at subflow level if skbs can't be moved to 2021 + * the mptcp rx queue fast enough (announced rcv_win can 2022 + * exceed ssk->sk_rcvbuf). 2023 + */ 2024 + mptcp_for_each_subflow(msk, subflow) { 2025 + struct sock *ssk; 2026 + bool slow; 2058 2027 2059 - grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); 2060 - 2061 - do_div(grow, msk->rcvq_space.space); 2062 - rcvwin += (grow << 1); 2063 - 2064 - rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), 2065 - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); 2066 - 2067 - if (rcvbuf > sk->sk_rcvbuf) { 2068 - u32 window_clamp; 2069 - 2070 - window_clamp = mptcp_win_from_space(sk, rcvbuf); 2071 - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); 2072 - 2073 - /* Make subflows follow along. 
If we do not do this, we 2074 - * get drops at subflow level if skbs can't be moved to 2075 - * the mptcp rx queue fast enough (announced rcv_win can 2076 - * exceed ssk->sk_rcvbuf). 2077 - */ 2078 - mptcp_for_each_subflow(msk, subflow) { 2079 - struct sock *ssk; 2080 - bool slow; 2081 - 2082 - ssk = mptcp_subflow_tcp_sock(subflow); 2083 - slow = lock_sock_fast(ssk); 2084 - WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); 2085 - WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); 2086 - if (tcp_can_send_ack(ssk)) 2087 - tcp_cleanup_rbuf(ssk, 1); 2088 - unlock_sock_fast(ssk, slow); 2089 - } 2028 + ssk = mptcp_subflow_tcp_sock(subflow); 2029 + slow = lock_sock_fast(ssk); 2030 + tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied; 2031 + tcp_rcvbuf_grow(ssk); 2032 + unlock_sock_fast(ssk, slow); 2090 2033 } 2091 2034 } 2092 2035 2093 - msk->rcvq_space.space = msk->rcvq_space.copied; 2094 2036 new_measure: 2095 2037 msk->rcvq_space.copied = 0; 2096 2038 msk->rcvq_space.time = mstamp; ··· 2096 2082 2097 2083 if (list_empty(&msk->conn_list)) 2098 2084 return false; 2099 - 2100 - /* verify we can move any data from the subflow, eventually updating */ 2101 - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) 2102 - mptcp_for_each_subflow(msk, subflow) 2103 - __mptcp_rcvbuf_update(sk, subflow->tcp_sock); 2104 2085 2105 2086 subflow = list_first_entry(&msk->conn_list, 2106 2087 struct mptcp_subflow_context, node); ··· 2214 2205 break; 2215 2206 } 2216 2207 2217 - if (sk->sk_shutdown & RCV_SHUTDOWN) { 2218 - /* race breaker: the shutdown could be after the 2219 - * previous receive queue check 2220 - */ 2221 - if (__mptcp_move_skbs(sk)) 2222 - continue; 2208 + if (sk->sk_shutdown & RCV_SHUTDOWN) 2223 2209 break; 2224 - } 2225 2210 2226 2211 if (sk->sk_state == TCP_CLOSE) { 2227 2212 copied = -ENOTCONN;
+2 -2
net/mptcp/protocol.h
··· 341 341 struct mptcp_pm_data pm; 342 342 struct mptcp_sched_ops *sched; 343 343 struct { 344 - u32 space; /* bytes copied in last measurement window */ 345 - u32 copied; /* bytes copied in this measurement window */ 344 + int space; /* bytes copied in last measurement window */ 345 + int copied; /* bytes copied in this measurement window */ 346 346 u64 time; /* start time of measurement window */ 347 347 u64 rtt_us; /* last maximum rtt of subflows */ 348 348 } rcvq_space;
+69
tools/testing/selftests/net/mptcp/mptcp_join.sh
··· 2320 2320 fi 2321 2321 } 2322 2322 2323 + laminar_endp_tests() 2324 + { 2325 + # no laminar endpoints: routing rules are used 2326 + if reset_with_tcp_filter "without a laminar endpoint" ns1 10.0.2.2 REJECT && 2327 + mptcp_lib_kallsyms_has "mptcp_pm_get_endp_laminar_max$"; then 2328 + pm_nl_set_limits $ns1 0 2 2329 + pm_nl_set_limits $ns2 2 2 2330 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal 2331 + run_tests $ns1 $ns2 10.0.1.1 2332 + join_syn_tx=1 \ 2333 + chk_join_nr 0 0 0 2334 + chk_add_nr 1 1 2335 + fi 2336 + 2337 + # laminar endpoints: this endpoint is used 2338 + if reset_with_tcp_filter "with a laminar endpoint" ns1 10.0.2.2 REJECT && 2339 + mptcp_lib_kallsyms_has "mptcp_pm_get_endp_laminar_max$"; then 2340 + pm_nl_set_limits $ns1 0 2 2341 + pm_nl_set_limits $ns2 2 2 2342 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal 2343 + pm_nl_add_endpoint $ns2 10.0.3.2 flags laminar 2344 + run_tests $ns1 $ns2 10.0.1.1 2345 + chk_join_nr 1 1 1 2346 + chk_add_nr 1 1 2347 + fi 2348 + 2349 + # laminar endpoints: these endpoints are used 2350 + if reset_with_tcp_filter "with multiple laminar endpoints" ns1 10.0.2.2 REJECT && 2351 + mptcp_lib_kallsyms_has "mptcp_pm_get_endp_laminar_max$"; then 2352 + pm_nl_set_limits $ns1 0 2 2353 + pm_nl_set_limits $ns2 2 2 2354 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal 2355 + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal 2356 + pm_nl_add_endpoint $ns2 dead:beef:3::2 flags laminar 2357 + pm_nl_add_endpoint $ns2 10.0.3.2 flags laminar 2358 + pm_nl_add_endpoint $ns2 10.0.4.2 flags laminar 2359 + run_tests $ns1 $ns2 10.0.1.1 2360 + chk_join_nr 2 2 2 2361 + chk_add_nr 2 2 2362 + fi 2363 + 2364 + # laminar endpoints: only one endpoint is used 2365 + if reset_with_tcp_filter "single laminar endpoint" ns1 10.0.2.2 REJECT && 2366 + mptcp_lib_kallsyms_has "mptcp_pm_get_endp_laminar_max$"; then 2367 + pm_nl_set_limits $ns1 0 2 2368 + pm_nl_set_limits $ns2 2 2 2369 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal 2370 + pm_nl_add_endpoint 
$ns1 10.0.3.1 flags signal 2371 + pm_nl_add_endpoint $ns2 10.0.3.2 flags laminar 2372 + run_tests $ns1 $ns2 10.0.1.1 2373 + chk_join_nr 1 1 1 2374 + chk_add_nr 2 2 2375 + fi 2376 + 2377 + # laminar endpoints: subflow and laminar flags 2378 + if reset_with_tcp_filter "sublow + laminar endpoints" ns1 10.0.2.2 REJECT && 2379 + mptcp_lib_kallsyms_has "mptcp_pm_get_endp_laminar_max$"; then 2380 + pm_nl_set_limits $ns1 0 4 2381 + pm_nl_set_limits $ns2 2 4 2382 + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal 2383 + pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,laminar 2384 + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,laminar 2385 + run_tests $ns1 $ns2 10.0.1.1 2386 + chk_join_nr 1 1 1 2387 + chk_add_nr 1 1 2388 + fi 2389 + } 2390 + 2323 2391 link_failure_tests() 2324 2392 { 2325 2393 # accept and use add_addr with additional subflows and link loss ··· 4177 4109 f@subflows_tests 4178 4110 e@subflows_error_tests 4179 4111 s@signal_address_tests 4112 + L@laminar_endp_tests 4180 4113 l@link_failure_tests 4181 4114 t@add_addr_timeout_tests 4182 4115 r@remove_tests
+9
tools/testing/selftests/net/mptcp/pm_nl_ctl.c
··· 830 830 flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW; 831 831 else if (!strcmp(tok, "signal")) 832 832 flags |= MPTCP_PM_ADDR_FLAG_SIGNAL; 833 + else if (!strcmp(tok, "laminar")) 834 + flags |= MPTCP_PM_ADDR_FLAG_LAMINAR; 833 835 else if (!strcmp(tok, "backup")) 834 836 flags |= MPTCP_PM_ADDR_FLAG_BACKUP; 835 837 else if (!strcmp(tok, "fullmesh")) ··· 1016 1014 if (flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { 1017 1015 printf("subflow"); 1018 1016 flags &= ~MPTCP_PM_ADDR_FLAG_SUBFLOW; 1017 + if (flags) 1018 + printf(","); 1019 + } 1020 + 1021 + if (flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { 1022 + printf("laminar"); 1023 + flags &= ~MPTCP_PM_ADDR_FLAG_LAMINAR; 1019 1024 if (flags) 1020 1025 printf(","); 1021 1026 }