Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'mptcp-rx-path-refactor'

Matthieu Baerts says:

====================
mptcp: rx path refactor

Paolo worked on this RX path refactor for these two main reasons:

- Currently, the MPTCP RX path introduces quite a bit of 'exceptional'
accounting/locking processing WRT to plain TCP, adding up to the
implementation complexity in a miserable way.

- The performance gap WRT plain TCP for single subflow connections is
quite measurable.

The present refactor addresses both the above items: most of the
additional complexity is dropped, and single stream performances
increase measurably, from 55Gbps to 71Gbps in Paolo's loopback test.
As a reference, plain TCP was around 84Gbps on the same host.

The above comes to a price: the patch are invasive, even in subtle ways.

Note: patch 5/7 removes the sk_forward_alloc_get() helper, which caused
some trivial modifications in different places in the net tree: sockets,
IPv4, sched. That's why a few more people have been Cc here. Feel free
to only look at this patch 5/7.
====================

Link: https://patch.msgid.link/20250218-net-next-mptcp-rx-path-refactor-v1-0-4a47d90d7998@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+134 -289
-13
include/net/sock.h
··· 1285 1285 unsigned int inuse_idx; 1286 1286 #endif 1287 1287 1288 - #if IS_ENABLED(CONFIG_MPTCP) 1289 - int (*forward_alloc_get)(const struct sock *sk); 1290 - #endif 1291 - 1292 1288 bool (*stream_memory_free)(const struct sock *sk, int wake); 1293 1289 bool (*sock_is_readable)(struct sock *sk); 1294 1290 /* Memory pressure */ ··· 1344 1348 int sock_load_diag_module(int family, int protocol); 1345 1349 1346 1350 INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); 1347 - 1348 - static inline int sk_forward_alloc_get(const struct sock *sk) 1349 - { 1350 - #if IS_ENABLED(CONFIG_MPTCP) 1351 - if (sk->sk_prot->forward_alloc_get) 1352 - return sk->sk_prot->forward_alloc_get(sk); 1353 - #endif 1354 - return READ_ONCE(sk->sk_forward_alloc); 1355 - } 1356 1351 1357 1352 static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) 1358 1353 {
+1 -1
net/core/sock.c
··· 3882 3882 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3883 3883 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3884 3884 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3885 - mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); 3885 + mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); 3886 3886 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3887 3887 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3888 3888 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
+1 -1
net/ipv4/af_inet.c
··· 153 153 WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); 154 154 WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); 155 155 WARN_ON_ONCE(sk->sk_wmem_queued); 156 - WARN_ON_ONCE(sk_forward_alloc_get(sk)); 156 + WARN_ON_ONCE(sk->sk_forward_alloc); 157 157 158 158 kfree(rcu_dereference_protected(inet->inet_opt, 1)); 159 159 dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
+1 -1
net/ipv4/inet_diag.c
··· 282 282 struct inet_diag_meminfo minfo = { 283 283 .idiag_rmem = sk_rmem_alloc_get(sk), 284 284 .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), 285 - .idiag_fmem = sk_forward_alloc_get(sk), 285 + .idiag_fmem = READ_ONCE(sk->sk_forward_alloc), 286 286 .idiag_tmem = sk_wmem_alloc_get(sk), 287 287 }; 288 288
+4 -23
net/mptcp/fastopen.c
··· 40 40 tp->copied_seq += skb->len; 41 41 subflow->ssn_offset += skb->len; 42 42 43 - /* initialize a dummy sequence number, we will update it at MPC 44 - * completion, if needed 45 - */ 43 + /* Only the sequence delta is relevant */ 46 44 MPTCP_SKB_CB(skb)->map_seq = -skb->len; 47 45 MPTCP_SKB_CB(skb)->end_seq = 0; 48 46 MPTCP_SKB_CB(skb)->offset = 0; 49 47 MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; 48 + MPTCP_SKB_CB(skb)->cant_coalesce = 1; 50 49 51 50 mptcp_data_lock(sk); 51 + DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk)); 52 52 53 - mptcp_set_owner_r(skb, sk); 53 + skb_set_owner_r(skb, sk); 54 54 __skb_queue_tail(&sk->sk_receive_queue, skb); 55 55 mptcp_sk(sk)->bytes_received += skb->len; 56 56 57 57 sk->sk_data_ready(sk); 58 58 59 59 mptcp_data_unlock(sk); 60 - } 61 - 62 - void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, 63 - const struct mptcp_options_received *mp_opt) 64 - { 65 - struct sock *sk = (struct sock *)msk; 66 - struct sk_buff *skb; 67 - 68 - skb = skb_peek_tail(&sk->sk_receive_queue); 69 - if (skb) { 70 - WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq); 71 - pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx\n", sk, 72 - MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq, 73 - MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq); 74 - MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq; 75 - MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq; 76 - } 77 - 78 - pr_debug("msk=%p ack_seq=%llx\n", msk, msk->ack_seq); 79 60 }
+101 -216
net/mptcp/protocol.c
··· 118 118 __kfree_skb(skb); 119 119 } 120 120 121 - static void mptcp_rmem_fwd_alloc_add(struct sock *sk, int size) 122 - { 123 - WRITE_ONCE(mptcp_sk(sk)->rmem_fwd_alloc, 124 - mptcp_sk(sk)->rmem_fwd_alloc + size); 125 - } 126 - 127 - static void mptcp_rmem_charge(struct sock *sk, int size) 128 - { 129 - mptcp_rmem_fwd_alloc_add(sk, -size); 130 - } 131 - 132 121 static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, 133 122 struct sk_buff *from) 134 123 { 135 124 bool fragstolen; 136 125 int delta; 137 126 138 - if (MPTCP_SKB_CB(from)->offset || 127 + if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) || 128 + MPTCP_SKB_CB(from)->offset || 139 129 ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || 140 130 !skb_try_coalesce(to, from, &fragstolen, &delta)) 141 131 return false; ··· 140 150 * negative one 141 151 */ 142 152 atomic_add(delta, &sk->sk_rmem_alloc); 143 - mptcp_rmem_charge(sk, delta); 153 + sk_mem_charge(sk, delta); 144 154 kfree_skb_partial(from, fragstolen); 145 155 146 156 return true; ··· 153 163 return false; 154 164 155 165 return mptcp_try_coalesce((struct sock *)msk, to, from); 156 - } 157 - 158 - static void __mptcp_rmem_reclaim(struct sock *sk, int amount) 159 - { 160 - amount >>= PAGE_SHIFT; 161 - mptcp_rmem_charge(sk, amount << PAGE_SHIFT); 162 - __sk_mem_reduce_allocated(sk, amount); 163 - } 164 - 165 - static void mptcp_rmem_uncharge(struct sock *sk, int size) 166 - { 167 - struct mptcp_sock *msk = mptcp_sk(sk); 168 - int reclaimable; 169 - 170 - mptcp_rmem_fwd_alloc_add(sk, size); 171 - reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); 172 - 173 - /* see sk_mem_uncharge() for the rationale behind the following schema */ 174 - if (unlikely(reclaimable >= PAGE_SIZE)) 175 - __mptcp_rmem_reclaim(sk, reclaimable); 176 - } 177 - 178 - static void mptcp_rfree(struct sk_buff *skb) 179 - { 180 - unsigned int len = skb->truesize; 181 - struct sock *sk = skb->sk; 182 - 183 - atomic_sub(len, &sk->sk_rmem_alloc); 184 - mptcp_rmem_uncharge(sk, len); 185 - } 186 - 187 - void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) 188 - { 189 - skb_orphan(skb); 190 - skb->sk = sk; 191 - skb->destructor = mptcp_rfree; 192 - atomic_add(skb->truesize, &sk->sk_rmem_alloc); 193 - mptcp_rmem_charge(sk, skb->truesize); 194 166 } 195 167 196 168 /* "inspired" by tcp_data_queue_ofo(), main differences: ··· 267 315 268 316 end: 269 317 skb_condense(skb); 270 - mptcp_set_owner_r(skb, sk); 271 - } 272 - 273 - static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) 274 - { 275 - struct mptcp_sock *msk = mptcp_sk(sk); 276 - int amt, amount; 277 - 278 - if (size <= msk->rmem_fwd_alloc) 279 - return true; 280 - 281 - size -= msk->rmem_fwd_alloc; 282 - amt = sk_mem_pages(size); 283 - amount = amt << PAGE_SHIFT; 284 - if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) 285 - return false; 286 - 287 - mptcp_rmem_fwd_alloc_add(sk, amount); 288 - return true; 318 + skb_set_owner_r(skb, sk); 289 319 } 290 320 291 321 static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, ··· 285 351 skb_orphan(skb); 286 352 287 353 /* try to fetch required memory from subflow */ 288 - if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) { 354 + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 289 355 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); 290 356 goto drop; 291 357 } ··· 300 366 MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; 301 367 MPTCP_SKB_CB(skb)->offset = offset; 302 368 MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; 369 + MPTCP_SKB_CB(skb)->cant_coalesce = 0; 303 370 304 371 if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { 305 372 /* in sequence */ ··· 310 375 if (tail && mptcp_try_coalesce(sk, tail, skb)) 311 376 return true; 312 377 313 - mptcp_set_owner_r(skb, sk); 378 + skb_set_owner_r(skb, sk); 314 379 __skb_queue_tail(&sk->sk_receive_queue, skb); 315 380 return true; 316 381 } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { ··· 496 561 bool cleanup, rx_empty; 497 562 498 563 cleanup = (space > 0) && (space >= (old_space << 1)) && copied; 499 - rx_empty = !__mptcp_rmem(sk) && copied; 564 + rx_empty = !sk_rmem_alloc_get(sk) && copied; 500 565 501 566 mptcp_for_each_subflow(msk, subflow) { 502 567 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); ··· 569 634 } 570 635 571 636 static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, 572 - struct sock *ssk, 573 - unsigned int *bytes) 637 + struct sock *ssk) 574 638 { 575 639 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 576 640 struct sock *sk = (struct sock *)msk; 577 - unsigned int moved = 0; 578 641 bool more_data_avail; 579 642 struct tcp_sock *tp; 580 - bool done = false; 581 - int sk_rbuf; 582 - 583 - sk_rbuf = READ_ONCE(sk->sk_rcvbuf); 584 - 585 - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 586 - int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); 587 - 588 - if (unlikely(ssk_rbuf > sk_rbuf)) { 589 - WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); 590 - sk_rbuf = ssk_rbuf; 591 - } 592 - } 643 + bool ret = false; 593 644 594 645 pr_debug("msk=%p ssk=%p\n", msk, ssk); 595 646 tp = tcp_sk(ssk); ··· 585 664 struct sk_buff *skb; 586 665 bool fin; 587 666 667 + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) 668 + break; 669 + 588 670 /* try to move as much data as available */ 589 671 map_remaining = subflow->map_data_len - 590 672 mptcp_subflow_get_map_offset(subflow); 591 673 592 674 skb = skb_peek(&ssk->sk_receive_queue); 593 - if (!skb) { 594 - /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), 595 - * a different CPU can have already processed the pending 596 - * data, stop here or we can enter an infinite loop 597 - */ 598 - if (!moved) 599 - done = true; 675 + if (unlikely(!skb)) 600 676 break; 601 - } 602 677 603 678 if (__mptcp_check_fallback(msk)) { 604 679 /* Under fallback skbs have no MPTCP extension and TCP could ··· 607 690 608 691 offset = seq - TCP_SKB_CB(skb)->seq; 609 692 fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 610 - if (fin) { 611 - done = true; 693 + if (fin) 612 694 seq++; 613 - } 614 695 615 696 if (offset < skb->len) { 616 697 size_t len = skb->len - offset; 617 698 618 - if (tp->urg_data) 619 - done = true; 620 - 621 - if (__mptcp_move_skb(msk, ssk, skb, offset, len)) 622 - moved += len; 699 + ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret; 623 700 seq += len; 624 701 625 702 if (unlikely(map_remaining < len)) { ··· 627 716 } 628 717 629 718 sk_eat_skb(ssk, skb); 630 - done = true; 631 719 } 632 720 633 721 WRITE_ONCE(tp->copied_seq, seq); 634 722 more_data_avail = mptcp_subflow_data_available(ssk); 635 723 636 - if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { 637 - done = true; 638 - break; 639 - } 640 724 } while (more_data_avail); 641 725 642 - if (moved > 0) 726 + if (ret) 643 727 msk->last_data_recv = tcp_jiffies32; 644 - *bytes += moved; 645 - return done; 728 + return ret; 646 729 } 647 730 648 731 static bool __mptcp_ofo_queue(struct mptcp_sock *msk) ··· 730 825 static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) 731 826 { 732 827 struct sock *sk = (struct sock *)msk; 733 - unsigned int moved = 0; 828 + bool moved; 734 829 735 - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); 830 + moved = __mptcp_move_skbs_from_subflow(msk, ssk); 736 831 __mptcp_ofo_queue(msk); 737 832 if (unlikely(ssk->sk_err)) { 738 833 if (!sock_owned_by_user(sk)) ··· 748 843 */ 749 844 if (mptcp_pending_data_fin(sk, NULL)) 750 845 mptcp_schedule_work(sk); 751 - return moved > 0; 846 + return moved; 847 + } 848 + 849 + static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) 850 + { 851 + if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf)) 852 + WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf); 853 + } 854 + 855 + static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) 856 + { 857 + struct mptcp_sock *msk = mptcp_sk(sk); 858 + 859 + __mptcp_rcvbuf_update(sk, ssk); 860 + 861 + /* Wake-up the reader only for in-sequence data */ 862 + if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) 863 + sk->sk_data_ready(sk); 752 864 } 753 865 754 866 void mptcp_data_ready(struct sock *sk, struct sock *ssk) 755 867 { 756 868 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 757 - struct mptcp_sock *msk = mptcp_sk(sk); 758 - int sk_rbuf, ssk_rbuf; 759 869 760 870 /* The peer can send data while we are shutting down this 761 871 * subflow at msk destruction time, but we must avoid enqueuing ··· 779 859 if (unlikely(subflow->disposable)) 780 860 return; 781 861 782 - ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); 783 - sk_rbuf = READ_ONCE(sk->sk_rcvbuf); 784 - if (unlikely(ssk_rbuf > sk_rbuf)) 785 - sk_rbuf = ssk_rbuf; 786 - 787 - /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ 788 - if (__mptcp_rmem(sk) > sk_rbuf) 789 - return; 790 - 791 - /* Wake-up the reader only for in-sequence data */ 792 862 mptcp_data_lock(sk); 793 - if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) 794 - sk->sk_data_ready(sk); 863 + if (!sock_owned_by_user(sk)) 864 + __mptcp_data_ready(sk, ssk); 865 + else 866 + __set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags); 795 867 mptcp_data_unlock(sk); 796 868 } 797 869 ··· 860 948 return true; 861 949 } 862 950 return false; 863 - } 864 - 865 - static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) 866 - { 867 - struct mptcp_subflow_context *subflow; 868 - 869 - msk_owned_by_me(msk); 870 - 871 - mptcp_for_each_subflow(msk, subflow) { 872 - if (READ_ONCE(subflow->data_avail)) 873 - return mptcp_subflow_tcp_sock(subflow); 874 - } 875 - 876 - return NULL; 877 951 } 878 952 879 953 static bool mptcp_skb_can_collapse_to(u64 write_seq, ··· 1842 1944 1843 1945 static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); 1844 1946 1845 - static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, 1947 + static int __mptcp_recvmsg_mskq(struct sock *sk, 1846 1948 struct msghdr *msg, 1847 1949 size_t len, int flags, 1848 1950 struct scm_timestamping_internal *tss, 1849 1951 int *cmsg_flags) 1850 1952 { 1953 + struct mptcp_sock *msk = mptcp_sk(sk); 1851 1954 struct sk_buff *skb, *tmp; 1852 1955 int copied = 0; 1853 1956 1854 - skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { 1957 + skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { 1855 1958 u32 offset = MPTCP_SKB_CB(skb)->offset; 1856 1959 u32 data_len = skb->len - offset; 1857 1960 u32 count = min_t(size_t, len - copied, data_len); ··· 1884 1985 } 1885 1986 1886 1987 if (!(flags & MSG_PEEK)) { 1887 - /* we will bulk release the skb memory later */ 1988 + /* avoid the indirect call, we know the destructor is sock_wfree */ 1888 1989 skb->destructor = NULL; 1889 - WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); 1890 - __skb_unlink(skb, &msk->receive_queue); 1990 + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 1991 + sk_mem_uncharge(sk, skb->truesize); 1992 + __skb_unlink(skb, &sk->sk_receive_queue); 1891 1993 __kfree_skb(skb); 1892 1994 msk->bytes_consumed += count; 1893 1995 } ··· 2001 2101 msk->rcvq_space.time = mstamp; 2002 2102 } 2003 2103 2004 - static void __mptcp_update_rmem(struct sock *sk) 2104 + static struct mptcp_subflow_context * 2105 + __mptcp_first_ready_from(struct mptcp_sock *msk, 2106 + struct mptcp_subflow_context *subflow) 2005 2107 { 2006 - struct mptcp_sock *msk = mptcp_sk(sk); 2108 + struct mptcp_subflow_context *start_subflow = subflow; 2007 2109 2008 - if (!msk->rmem_released) 2009 - return; 2010 - 2011 - atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); 2012 - mptcp_rmem_uncharge(sk, msk->rmem_released); 2013 - WRITE_ONCE(msk->rmem_released, 0); 2110 + while (!READ_ONCE(subflow->data_avail)) { 2111 + subflow = mptcp_next_subflow(msk, subflow); 2112 + if (subflow == start_subflow) 2113 + return NULL; 2114 + } 2115 + return subflow; 2014 2116 } 2015 2117 2016 - static void __mptcp_splice_receive_queue(struct sock *sk) 2118 + static bool __mptcp_move_skbs(struct sock *sk) 2017 2119 { 2120 + struct mptcp_subflow_context *subflow; 2018 2121 struct mptcp_sock *msk = mptcp_sk(sk); 2122 + bool ret = false; 2019 2123 2020 - skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); 2021 - } 2124 + if (list_empty(&msk->conn_list)) 2125 + return false; 2022 2126 2023 - static bool __mptcp_move_skbs(struct mptcp_sock *msk) 2024 - { 2025 - struct sock *sk = (struct sock *)msk; 2026 - unsigned int moved = 0; 2027 - bool ret, done; 2127 + /* verify we can move any data from the subflow, eventually updating */ 2128 + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) 2129 + mptcp_for_each_subflow(msk, subflow) 2130 + __mptcp_rcvbuf_update(sk, subflow->tcp_sock); 2028 2131 2029 - do { 2030 - struct sock *ssk = mptcp_subflow_recv_lookup(msk); 2132 + subflow = list_first_entry(&msk->conn_list, 2133 + struct mptcp_subflow_context, node); 2134 + for (;;) { 2135 + struct sock *ssk; 2031 2136 bool slowpath; 2032 2137 2033 - /* we can have data pending in the subflows only if the msk 2034 - * receive buffer was full at subflow_data_ready() time, 2035 - * that is an unlikely slow path. 2138 + /* 2139 + * As an optimization avoid traversing the subflows list 2140 + * and ev. acquiring the subflow socket lock before baling out 2036 2141 */ 2037 - if (likely(!ssk)) 2142 + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) 2038 2143 break; 2039 2144 2040 - slowpath = lock_sock_fast(ssk); 2041 - mptcp_data_lock(sk); 2042 - __mptcp_update_rmem(sk); 2043 - done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); 2044 - mptcp_data_unlock(sk); 2145 + subflow = __mptcp_first_ready_from(msk, subflow); 2146 + if (!subflow) 2147 + break; 2045 2148 2149 + ssk = mptcp_subflow_tcp_sock(subflow); 2150 + slowpath = lock_sock_fast(ssk); 2151 + ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret; 2046 2152 if (unlikely(ssk->sk_err)) 2047 2153 __mptcp_error_report(sk); 2048 2154 unlock_sock_fast(ssk, slowpath); 2049 - } while (!done); 2050 2155 2051 - /* acquire the data lock only if some input data is pending */ 2052 - ret = moved > 0; 2053 - if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || 2054 - !skb_queue_empty_lockless(&sk->sk_receive_queue)) { 2055 - mptcp_data_lock(sk); 2056 - __mptcp_update_rmem(sk); 2057 - ret |= __mptcp_ofo_queue(msk); 2058 - __mptcp_splice_receive_queue(sk); 2059 - mptcp_data_unlock(sk); 2156 + subflow = mptcp_next_subflow(msk, subflow); 2060 2157 } 2158 + 2159 + __mptcp_ofo_queue(msk); 2061 2160 if (ret) 2062 2161 mptcp_check_data_fin((struct sock *)msk); 2063 - return !skb_queue_empty(&msk->receive_queue); 2162 + return ret; 2064 2163 } 2065 2164 2066 2165 static unsigned int mptcp_inq_hint(const struct sock *sk) ··· 2067 2168 const struct mptcp_sock *msk = mptcp_sk(sk); 2068 2169 const struct sk_buff *skb; 2069 2170 2070 - skb = skb_peek(&msk->receive_queue); 2171 + skb = skb_peek(&sk->sk_receive_queue); 2071 2172 if (skb) { 2072 2173 u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq; 2073 2174 ··· 2113 2214 while (copied < len) { 2114 2215 int err, bytes_read; 2115 2216 2116 - bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); 2217 + bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags); 2117 2218 if (unlikely(bytes_read < 0)) { 2118 2219 if (!copied) 2119 2220 copied = bytes_read; ··· 2122 2223 2123 2224 copied += bytes_read; 2124 2225 2125 - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) 2226 + if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk)) 2126 2227 continue; 2127 2228 2128 2229 /* only the MPTCP socket status is relevant here. The exit ··· 2148 2249 /* race breaker: the shutdown could be after the 2149 2250 * previous receive queue check 2150 2251 */ 2151 - if (__mptcp_move_skbs(msk)) 2252 + if (__mptcp_move_skbs(sk)) 2152 2253 continue; 2153 2254 break; 2154 2255 } ··· 2192 2293 } 2193 2294 } 2194 2295 2195 - pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", 2196 - msk, skb_queue_empty_lockless(&sk->sk_receive_queue), 2197 - skb_queue_empty(&msk->receive_queue), copied); 2296 + pr_debug("msk=%p rx queue empty=%d copied=%d\n", 2297 + msk, skb_queue_empty(&sk->sk_receive_queue), copied); 2198 2298 2199 2299 release_sock(sk); 2200 2300 return copied; ··· 2720 2822 INIT_LIST_HEAD(&msk->join_list); 2721 2823 INIT_LIST_HEAD(&msk->rtx_queue); 2722 2824 INIT_WORK(&msk->work, mptcp_worker); 2723 - __skb_queue_head_init(&msk->receive_queue); 2724 2825 msk->out_of_order_queue = RB_ROOT; 2725 2826 msk->first_pending = NULL; 2726 - WRITE_ONCE(msk->rmem_fwd_alloc, 0); 2727 - WRITE_ONCE(msk->rmem_released, 0); 2728 2827 msk->timer_ival = TCP_RTO_MIN; 2729 2828 msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; 2730 2829 ··· 2947 3052 2948 3053 sk->sk_prot->destroy(sk); 2949 3054 2950 - WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc)); 2951 - WARN_ON_ONCE(msk->rmem_released); 2952 3055 sk_stream_kill_queues(sk); 2953 3056 xfrm_sk_free_policy(sk); 2954 3057 ··· 3298 3405 mptcp_for_each_subflow_safe(msk, subflow, tmp) 3299 3406 __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); 3300 3407 3301 - /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ 3302 - mptcp_data_lock(sk); 3303 - skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); 3304 3408 __skb_queue_purge(&sk->sk_receive_queue); 3305 3409 skb_rbtree_purge(&msk->out_of_order_queue); 3306 - mptcp_data_unlock(sk); 3307 3410 3308 3411 /* move all the rx fwd alloc into the sk_mem_reclaim_final in 3309 3412 * inet_sock_destruct() will dispose it 3310 3413 */ 3311 - sk_forward_alloc_add(sk, msk->rmem_fwd_alloc); 3312 - WRITE_ONCE(msk->rmem_fwd_alloc, 0); 3313 3414 mptcp_token_destroy(msk); 3314 3415 mptcp_pm_free_anno_list(msk); 3315 3416 mptcp_free_local_addr_list(msk); ··· 3340 3453 3341 3454 #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ 3342 3455 BIT(MPTCP_RETRANSMIT) | \ 3343 - BIT(MPTCP_FLUSH_JOIN_LIST)) 3456 + BIT(MPTCP_FLUSH_JOIN_LIST) | \ 3457 + BIT(MPTCP_DEQUEUE)) 3344 3458 3345 3459 /* processes deferred events and flush wmem */ 3346 3460 static void mptcp_release_cb(struct sock *sk) ··· 3375 3487 __mptcp_push_pending(sk, 0); 3376 3488 if (flags & BIT(MPTCP_RETRANSMIT)) 3377 3489 __mptcp_retrans(sk); 3490 + if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) { 3491 + /* notify ack seq update */ 3492 + mptcp_cleanup_rbuf(msk, 0); 3493 + sk->sk_data_ready(sk); 3494 + } 3378 3495 3379 3496 cond_resched(); 3380 3497 spin_lock_bh(&sk->sk_lock.slock); ··· 3399 3506 if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) 3400 3507 __mptcp_sync_sndbuf(sk); 3401 3508 } 3402 - 3403 - __mptcp_update_rmem(sk); 3404 3509 } 3405 3510 3406 3511 /* MP_JOIN client subflow must wait for 4th ack before sending any data: ··· 3569 3678 __mptcp_wr_shutdown(sk); 3570 3679 } 3571 3680 3572 - static int mptcp_forward_alloc_get(const struct sock *sk) 3573 - { 3574 - return READ_ONCE(sk->sk_forward_alloc) + 3575 - READ_ONCE(mptcp_sk(sk)->rmem_fwd_alloc); 3576 - } 3577 - 3578 3681 static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) 3579 3682 { 3580 3683 const struct sock *sk = (void *)msk; ··· 3609 3724 return -EINVAL; 3610 3725 3611 3726 lock_sock(sk); 3612 - __mptcp_move_skbs(msk); 3727 + if (__mptcp_move_skbs(sk)) 3728 + mptcp_cleanup_rbuf(msk, 0); 3613 3729 *karg = mptcp_inq_hint(sk); 3614 3730 release_sock(sk); 3615 3731 break; ··· 3727 3841 .hash = mptcp_hash, 3728 3842 .unhash = mptcp_unhash, 3729 3843 .get_port = mptcp_get_port, 3730 - .forward_alloc_get = mptcp_forward_alloc_get, 3731 3844 .stream_memory_free = mptcp_stream_memory_free, 3732 3845 .sockets_allocated = &mptcp_sockets_allocated, 3733 3846
+7 -15
net/mptcp/protocol.h
··· 124 124 #define MPTCP_FLUSH_JOIN_LIST 5 125 125 #define MPTCP_SYNC_STATE 6 126 126 #define MPTCP_SYNC_SNDBUF 7 127 + #define MPTCP_DEQUEUE 8 127 128 128 129 struct mptcp_skb_cb { 129 130 u64 map_seq; 130 131 u64 end_seq; 131 132 u32 offset; 132 - u8 has_rxtstamp:1; 133 + u8 has_rxtstamp; 134 + u8 cant_coalesce; 133 135 }; 134 136 135 137 #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) ··· 281 279 u64 rcv_data_fin_seq; 282 280 u64 bytes_retrans; 283 281 u64 bytes_consumed; 284 - int rmem_fwd_alloc; 285 282 int snd_burst; 286 283 int old_wspace; 287 284 u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; ··· 295 294 u32 last_ack_recv; 296 295 unsigned long timer_ival; 297 296 u32 token; 298 - int rmem_released; 299 297 unsigned long flags; 300 298 unsigned long cb_flags; 301 299 bool recovery; /* closing subflow write queue reinjected */ ··· 324 324 struct work_struct work; 325 325 struct sk_buff *ooo_last_skb; 326 326 struct rb_root out_of_order_queue; 327 - struct sk_buff_head receive_queue; 328 327 struct list_head conn_list; 329 328 struct list_head rtx_queue; 330 329 struct mptcp_data_frag *first_pending; ··· 354 355 list_for_each_entry(__subflow, &((__msk)->conn_list), node) 355 356 #define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ 356 357 list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) 358 + #define mptcp_next_subflow(__msk, __subflow) \ 359 + list_next_entry_circular(__subflow, &((__msk)->conn_list), node) 357 360 358 361 extern struct genl_family mptcp_genl_family; 359 362 ··· 382 381 #define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) 383 382 #endif 384 383 385 - /* the msk socket don't use the backlog, also account for the bulk 386 - * free memory 387 - */ 388 - static inline int __mptcp_rmem(const struct sock *sk) 389 - { 390 - return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); 391 - } 392 - 393 384 static inline int mptcp_win_from_space(const struct sock *sk, int space) 394 385 { 395 386 return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); ··· 394 401 395 402 static inline int __mptcp_space(const struct sock *sk) 396 403 { 397 - return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); 404 + return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - 405 + sk_rmem_alloc_get(sk)); 398 406 } 399 407 400 408 static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) ··· 1053 1059 enum mptcp_event_type event); 1054 1060 bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); 1055 1061 1056 - void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, 1057 - const struct mptcp_options_received *mp_opt); 1058 1062 void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, 1059 1063 struct request_sock *req); 1060 1064 int mptcp_nl_fill_addr(struct sk_buff *skb,
+18 -18
net/mptcp/subflow.c
··· 802 802 subflow_set_remote_key(msk, subflow, mp_opt); 803 803 WRITE_ONCE(subflow->fully_established, true); 804 804 WRITE_ONCE(msk->fully_established, true); 805 - 806 - if (subflow->is_mptfo) 807 - __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); 808 805 } 809 806 810 807 static struct sock *subflow_syn_recv_sock(const struct sock *sk, ··· 1268 1271 subflow->map_valid = 0; 1269 1272 } 1270 1273 1271 - /* sched mptcp worker to remove the subflow if no more data is pending */ 1274 + static bool subflow_is_done(const struct sock *sk) 1275 + { 1276 + return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; 1277 + } 1278 + 1279 + /* sched mptcp worker for subflow cleanup if no more data is pending */ 1272 1280 static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) 1273 1281 { 1274 1282 struct sock *sk = (struct sock *)msk; ··· 1283 1281 inet_sk_state_load(sk) != TCP_ESTABLISHED))) 1284 1282 return; 1285 1283 1286 - if (skb_queue_empty(&ssk->sk_receive_queue) && 1287 - !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) 1284 + if (!skb_queue_empty(&ssk->sk_receive_queue)) 1285 + return; 1286 + 1287 + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) 1288 + mptcp_schedule_work(sk); 1289 + 1290 + /* when the fallback subflow closes the rx side, trigger a 'dummy' 1291 + * ingress data fin, so that the msk state will follow along 1292 + */ 1293 + if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) && 1294 + msk->first == ssk && 1295 + mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) 1288 1296 mptcp_schedule_work(sk); 1289 1297 } 1290 1298 ··· 1854 1842 rcu_read_unlock(); 1855 1843 } 1856 1844 1857 - static bool subflow_is_done(const struct sock *sk) 1858 - { 1859 - return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; 1860 - } 1861 - 1862 1845 static void subflow_state_change(struct sock *sk) 1863 1846 { 1864 1847 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); ··· 1880 1873 subflow_error_report(sk); 1881 1874 1882 1875 subflow_sched_work_if_closed(mptcp_sk(parent), sk); 1883 - 1884 - /* when the fallback subflow closes the rx side, trigger a 'dummy' 1885 - * ingress data fin, so that the msk state will follow along 1886 - */ 1887 - if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && 1888 - mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) 1889 - mptcp_schedule_work(parent); 1890 1876 } 1891 1877 1892 1878 void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
+1 -1
net/sched/em_meta.c
··· 460 460 *err = -1; 461 461 return; 462 462 } 463 - dst->value = sk_forward_alloc_get(sk); 463 + dst->value = READ_ONCE(sk->sk_forward_alloc); 464 464 } 465 465 466 466 META_COLLECTOR(int_sk_sndbuf)