Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'mptcp-memcg-accounting-for-passive-sockets-backlog-processing'

Matthieu Baerts says:

====================
mptcp: memcg accounting for passive sockets & backlog processing

This series is split in two: the first 4 patches are linked to memcg
accounting for passive sockets, and the rest introduces backlog
processing. They are sent together, because the first one appeared to be
needed to get the second one fully working.

The second part includes RX path improvements built around backlog
processing. The main goals are improving RX performance _and_
increasing long-term maintainability.

- Patches 1-3: preparation work to ease the introduction of the next
patch.

- Patch 4: fix memcg accounting for passive sockets. Note that this is a
(non-urgent) fix, but it depends on material that is currently only in
net-next, e.g. commit 4a997d49d92a ("tcp: Save lock_sock() for memcg
in inet_csk_accept().").

- Patches 5-6: preparation of the stack for backlog processing, removing
assumptions that will not hold true any more after the backlog
introduction.

- Patches 7,8,10,11,12 are more cleanups that will make the backlog
patch a little less huge.

- Patch 9: a somewhat unrelated cleanup, included here so that it is
not forgotten.

- Patches 13-14: The real work is done by them. Patch 13 introduces the
helpers needed to manipulate the msk-level backlog, and the data
struct itself, without any actual functional change. Patch 14 finally
uses the backlog for RX skb processing. Note that MPTCP can't use the
sk_backlog, as the MPTCP release callback can also release and
re-acquire the msk-level spinlock, and core backlog processing works
under the assumption that such an event is not possible.
A relevant point is memory accounting for skbs in the backlog. It's
somewhat "original" due to MPTCP constraints. Such skbs use space from
the incoming subflow receive buffer and do not explicitly use any forward
allocated memory, as we can't update the msk fwd mem while enqueuing,
nor do we want to acquire the ssk socket lock again while processing the
skbs. Instead the msk borrows memory from the subflow and reserves it
for the backlog, see patches 5 and 14 for the gory details.
====================

Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-0-1f34b6c1e0b1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+417 -156
+2
include/net/sock.h
··· 1631 1631 sk_mem_reclaim(sk); 1632 1632 } 1633 1633 1634 + void __sk_charge(struct sock *sk, gfp_t gfp); 1635 + 1634 1636 #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) 1635 1637 static inline void sk_owner_set(struct sock *sk, struct module *owner) 1636 1638 {
+18
net/core/sock.c
··· 3448 3448 } 3449 3449 EXPORT_SYMBOL(__sk_mem_reclaim); 3450 3450 3451 + void __sk_charge(struct sock *sk, gfp_t gfp) 3452 + { 3453 + int amt; 3454 + 3455 + gfp |= __GFP_NOFAIL; 3456 + if (mem_cgroup_from_sk(sk)) { 3457 + /* The socket has not been accepted yet, no need 3458 + * to look at newsk->sk_wmem_queued. 3459 + */ 3460 + amt = sk_mem_pages(sk->sk_forward_alloc + 3461 + atomic_read(&sk->sk_rmem_alloc)); 3462 + if (amt) 3463 + mem_cgroup_sk_charge(sk, amt, gfp); 3464 + } 3465 + 3466 + kmem_cache_charge(sk, gfp); 3467 + } 3468 + 3451 3469 int sk_set_peek_off(struct sock *sk, int val) 3452 3470 { 3453 3471 WRITE_ONCE(sk->sk_peek_off, val);
+1 -16
net/ipv4/af_inet.c
··· 756 756 void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk) 757 757 { 758 758 if (mem_cgroup_sockets_enabled) { 759 - gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; 760 - 761 759 mem_cgroup_sk_alloc(newsk); 762 - 763 - if (mem_cgroup_from_sk(newsk)) { 764 - int amt; 765 - 766 - /* The socket has not been accepted yet, no need 767 - * to look at newsk->sk_wmem_queued. 768 - */ 769 - amt = sk_mem_pages(newsk->sk_forward_alloc + 770 - atomic_read(&newsk->sk_rmem_alloc)); 771 - if (amt) 772 - mem_cgroup_sk_charge(newsk, amt, gfp); 773 - } 774 - 775 - kmem_cache_charge(newsk, gfp); 760 + __sk_charge(newsk, GFP_KERNEL); 776 761 } 777 762 778 763 sock_rps_record_flow(newsk);
+3 -1
net/mptcp/fastopen.c
··· 32 32 /* dequeue the skb from sk receive queue */ 33 33 __skb_unlink(skb, &ssk->sk_receive_queue); 34 34 skb_ext_reset(skb); 35 - skb_orphan(skb); 35 + 36 + mptcp_subflow_lend_fwdmem(subflow, skb); 36 37 37 38 /* We copy the fastopen data, but that don't belong to the mptcp sequence 38 39 * space, need to offset it in the subflow sequence, see mptcp_subflow_get_map_offset() ··· 51 50 mptcp_data_lock(sk); 52 51 DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk)); 53 52 53 + mptcp_borrow_fwdmem(sk, skb); 54 54 skb_set_owner_r(skb, sk); 55 55 __skb_queue_tail(&sk->sk_receive_queue, skb); 56 56 mptcp_sk(sk)->bytes_received += skb->len;
-1
net/mptcp/mib.c
··· 71 71 SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX), 72 72 SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX), 73 73 SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX), 74 - SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), 75 74 SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), 76 75 SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), 77 76 SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED),
-1
net/mptcp/mib.h
··· 70 70 MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */ 71 71 MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */ 72 72 MPTCP_MIB_MPRSTRX, /* Received a MP_RST */ 73 - MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ 74 73 MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ 75 74 MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ 76 75 MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */
+2 -1
net/mptcp/mptcp_diag.c
··· 195 195 struct mptcp_sock *msk = mptcp_sk(sk); 196 196 struct mptcp_info *info = _info; 197 197 198 - r->idiag_rqueue = sk_rmem_alloc_get(sk); 198 + r->idiag_rqueue = sk_rmem_alloc_get(sk) + 199 + READ_ONCE(mptcp_sk(sk)->backlog_len); 199 200 r->idiag_wqueue = sk_wmem_alloc_get(sk); 200 201 201 202 if (inet_sk_state_load(sk) == TCP_LISTEN) {
+3 -1
net/mptcp/pm.c
··· 594 594 void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, 595 595 const struct mptcp_subflow_context *subflow) 596 596 { 597 + struct sock *sk = (struct sock *)msk; 597 598 struct mptcp_pm_data *pm = &msk->pm; 598 599 bool update_subflows; 599 600 ··· 618 617 /* Even if this subflow is not really established, tell the PM to try 619 618 * to pick the next ones, if possible. 620 619 */ 621 - if (mptcp_pm_nl_check_work_pending(msk)) 620 + if (mptcp_is_fully_established(sk) && 621 + mptcp_pm_nl_check_work_pending(msk)) 622 622 mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); 623 623 624 624 spin_unlock_bh(&pm->lock);
+2
net/mptcp/pm_kernel.c
··· 337 337 struct mptcp_pm_local local; 338 338 339 339 mptcp_mpc_endpoint_setup(msk); 340 + if (!mptcp_is_fully_established(sk)) 341 + return; 340 342 341 343 pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", 342 344 msk->pm.local_addr_used, endp_subflow_max,
+307 -121
net/mptcp/protocol.c
··· 358 358 static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, 359 359 int copy_len) 360 360 { 361 - const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 361 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 362 362 bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; 363 363 364 364 /* the skb map_seq accounts for the skb offset: ··· 383 383 struct mptcp_sock *msk = mptcp_sk(sk); 384 384 struct sk_buff *tail; 385 385 386 - /* try to fetch required memory from subflow */ 387 - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 388 - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); 389 - goto drop; 390 - } 386 + mptcp_borrow_fwdmem(sk, skb); 391 387 392 388 if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { 393 389 /* in sequence */ ··· 405 409 * will retransmit as needed, if needed. 406 410 */ 407 411 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); 408 - drop: 409 412 mptcp_drop(sk, skb); 410 413 return false; 411 414 } ··· 659 664 } 660 665 } 661 666 667 + static void __mptcp_add_backlog(struct sock *sk, 668 + struct mptcp_subflow_context *subflow, 669 + struct sk_buff *skb) 670 + { 671 + struct mptcp_sock *msk = mptcp_sk(sk); 672 + struct sk_buff *tail = NULL; 673 + struct sock *ssk = skb->sk; 674 + bool fragstolen; 675 + int delta; 676 + 677 + if (unlikely(sk->sk_state == TCP_CLOSE)) { 678 + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); 679 + return; 680 + } 681 + 682 + /* Try to coalesce with the last skb in our backlog */ 683 + if (!list_empty(&msk->backlog_list)) 684 + tail = list_last_entry(&msk->backlog_list, struct sk_buff, list); 685 + 686 + if (tail && MPTCP_SKB_CB(skb)->map_seq == MPTCP_SKB_CB(tail)->end_seq && 687 + ssk == tail->sk && 688 + __mptcp_try_coalesce(sk, tail, skb, &fragstolen, &delta)) { 689 + skb->truesize -= delta; 690 + kfree_skb_partial(skb, fragstolen); 691 + __mptcp_subflow_lend_fwdmem(subflow, delta); 692 + goto account; 693 + } 694 + 695 + list_add_tail(&skb->list, 
&msk->backlog_list); 696 + mptcp_subflow_lend_fwdmem(subflow, skb); 697 + delta = skb->truesize; 698 + 699 + account: 700 + WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta); 701 + 702 + /* Possibly not accept()ed yet, keep track of memory not CG 703 + * accounted, mptcp_graft_subflows() will handle it. 704 + */ 705 + if (!mem_cgroup_from_sk(ssk)) 706 + msk->backlog_unaccounted += delta; 707 + } 708 + 662 709 static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, 663 - struct sock *ssk) 710 + struct sock *ssk, bool own_msk) 664 711 { 665 712 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 666 713 struct sock *sk = (struct sock *)msk; ··· 717 680 u32 seq = tp->copied_seq; 718 681 struct sk_buff *skb; 719 682 bool fin; 720 - 721 - if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) 722 - break; 723 683 724 684 /* try to move as much data as available */ 725 685 map_remaining = subflow->map_data_len - ··· 744 710 size_t len = skb->len - offset; 745 711 746 712 mptcp_init_skb(ssk, skb, offset, len); 747 - skb_orphan(skb); 748 - ret = __mptcp_move_skb(sk, skb) || ret; 713 + 714 + if (own_msk && sk_rmem_alloc_get(sk) < sk->sk_rcvbuf) { 715 + mptcp_subflow_lend_fwdmem(subflow, skb); 716 + ret |= __mptcp_move_skb(sk, skb); 717 + } else { 718 + __mptcp_add_backlog(sk, subflow, skb); 719 + } 749 720 seq += len; 750 721 751 722 if (unlikely(map_remaining < len)) { ··· 869 830 struct sock *sk = (struct sock *)msk; 870 831 bool moved; 871 832 872 - moved = __mptcp_move_skbs_from_subflow(msk, ssk); 833 + moved = __mptcp_move_skbs_from_subflow(msk, ssk, true); 873 834 __mptcp_ofo_queue(msk); 874 835 if (unlikely(ssk->sk_err)) 875 836 __mptcp_subflow_error_report(sk, ssk); ··· 884 845 return moved; 885 846 } 886 847 887 - static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) 888 - { 889 - struct mptcp_sock *msk = mptcp_sk(sk); 890 - 891 - /* Wake-up the reader only for in-sequence data */ 892 - if (move_skbs_to_msk(msk, ssk) && 
mptcp_epollin_ready(sk)) 893 - sk->sk_data_ready(sk); 894 - } 895 - 896 848 void mptcp_data_ready(struct sock *sk, struct sock *ssk) 897 849 { 898 850 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 851 + struct mptcp_sock *msk = mptcp_sk(sk); 899 852 900 853 /* The peer can send data while we are shutting down this 901 - * subflow at msk destruction time, but we must avoid enqueuing 854 + * subflow at subflow destruction time, but we must avoid enqueuing 902 855 * more data to the msk receive queue 903 856 */ 904 - if (unlikely(subflow->disposable)) 857 + if (unlikely(subflow->closing)) 905 858 return; 906 859 907 860 mptcp_data_lock(sk); 908 - if (!sock_owned_by_user(sk)) 909 - __mptcp_data_ready(sk, ssk); 910 - else 911 - __set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags); 861 + if (!sock_owned_by_user(sk)) { 862 + /* Wake-up the reader only for in-sequence data */ 863 + if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) 864 + sk->sk_data_ready(sk); 865 + } else { 866 + __mptcp_move_skbs_from_subflow(msk, ssk, false); 867 + } 912 868 mptcp_data_unlock(sk); 913 869 } 914 870 ··· 928 894 } 929 895 mptcp_subflow_joined(msk, ssk); 930 896 spin_unlock_bh(&msk->fallback_lock); 931 - 932 - /* attach to msk socket only after we are sure we will deal with it 933 - * at close time 934 - */ 935 - if (sk->sk_socket && !ssk->sk_socket) 936 - mptcp_sock_graft(ssk, sk->sk_socket); 937 897 938 898 mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; 939 899 mptcp_sockopt_sync_locked(msk, ssk); ··· 2142 2114 msk->rcvq_space.time = mstamp; 2143 2115 } 2144 2116 2145 - static struct mptcp_subflow_context * 2146 - __mptcp_first_ready_from(struct mptcp_sock *msk, 2147 - struct mptcp_subflow_context *subflow) 2117 + static bool __mptcp_move_skbs(struct sock *sk, struct list_head *skbs, u32 *delta) 2148 2118 { 2149 - struct mptcp_subflow_context *start_subflow = subflow; 2150 - 2151 - while (!READ_ONCE(subflow->data_avail)) { 2152 - subflow = 
mptcp_next_subflow(msk, subflow); 2153 - if (subflow == start_subflow) 2154 - return NULL; 2155 - } 2156 - return subflow; 2157 - } 2158 - 2159 - static bool __mptcp_move_skbs(struct sock *sk) 2160 - { 2161 - struct mptcp_subflow_context *subflow; 2119 + struct sk_buff *skb = list_first_entry(skbs, struct sk_buff, list); 2162 2120 struct mptcp_sock *msk = mptcp_sk(sk); 2163 - bool ret = false; 2121 + bool moved = false; 2164 2122 2165 - if (list_empty(&msk->conn_list)) 2166 - return false; 2167 - 2168 - subflow = list_first_entry(&msk->conn_list, 2169 - struct mptcp_subflow_context, node); 2170 - for (;;) { 2171 - struct sock *ssk; 2172 - bool slowpath; 2173 - 2174 - /* 2175 - * As an optimization avoid traversing the subflows list 2176 - * and ev. acquiring the subflow socket lock before baling out 2177 - */ 2123 + *delta = 0; 2124 + while (1) { 2125 + /* If the msk recvbuf is full stop, don't drop */ 2178 2126 if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) 2179 2127 break; 2180 2128 2181 - subflow = __mptcp_first_ready_from(msk, subflow); 2182 - if (!subflow) 2129 + prefetch(skb->next); 2130 + list_del(&skb->list); 2131 + *delta += skb->truesize; 2132 + 2133 + moved |= __mptcp_move_skb(sk, skb); 2134 + if (list_empty(skbs)) 2183 2135 break; 2184 2136 2185 - ssk = mptcp_subflow_tcp_sock(subflow); 2186 - slowpath = lock_sock_fast(ssk); 2187 - ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret; 2188 - if (unlikely(ssk->sk_err)) 2189 - __mptcp_error_report(sk); 2190 - unlock_sock_fast(ssk, slowpath); 2191 - 2192 - subflow = mptcp_next_subflow(msk, subflow); 2137 + skb = list_first_entry(skbs, struct sk_buff, list); 2193 2138 } 2194 2139 2195 2140 __mptcp_ofo_queue(msk); 2196 - if (ret) 2141 + if (moved) 2197 2142 mptcp_check_data_fin((struct sock *)msk); 2198 - return ret; 2143 + return moved; 2144 + } 2145 + 2146 + static bool mptcp_can_spool_backlog(struct sock *sk, struct list_head *skbs) 2147 + { 2148 + struct mptcp_sock *msk = mptcp_sk(sk); 2149 + 2150 + /* After 
CG initialization, subflows should never add skb before 2151 + * gaining the CG themself. 2152 + */ 2153 + DEBUG_NET_WARN_ON_ONCE(msk->backlog_unaccounted && sk->sk_socket && 2154 + mem_cgroup_from_sk(sk)); 2155 + 2156 + /* Don't spool the backlog if the rcvbuf is full. */ 2157 + if (list_empty(&msk->backlog_list) || 2158 + sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) 2159 + return false; 2160 + 2161 + INIT_LIST_HEAD(skbs); 2162 + list_splice_init(&msk->backlog_list, skbs); 2163 + return true; 2164 + } 2165 + 2166 + static void mptcp_backlog_spooled(struct sock *sk, u32 moved, 2167 + struct list_head *skbs) 2168 + { 2169 + struct mptcp_sock *msk = mptcp_sk(sk); 2170 + 2171 + WRITE_ONCE(msk->backlog_len, msk->backlog_len - moved); 2172 + list_splice(skbs, &msk->backlog_list); 2173 + } 2174 + 2175 + static bool mptcp_move_skbs(struct sock *sk) 2176 + { 2177 + struct list_head skbs; 2178 + bool enqueued = false; 2179 + u32 moved; 2180 + 2181 + mptcp_data_lock(sk); 2182 + while (mptcp_can_spool_backlog(sk, &skbs)) { 2183 + mptcp_data_unlock(sk); 2184 + enqueued |= __mptcp_move_skbs(sk, &skbs, &moved); 2185 + 2186 + mptcp_data_lock(sk); 2187 + mptcp_backlog_spooled(sk, moved, &skbs); 2188 + } 2189 + mptcp_data_unlock(sk); 2190 + return enqueued; 2199 2191 } 2200 2192 2201 2193 static unsigned int mptcp_inq_hint(const struct sock *sk) ··· 2281 2233 2282 2234 copied += bytes_read; 2283 2235 2284 - if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk)) 2236 + if (!list_empty(&msk->backlog_list) && mptcp_move_skbs(sk)) 2285 2237 continue; 2286 2238 2287 2239 /* only the MPTCP socket status is relevant here. The exit ··· 2495 2447 { 2496 2448 struct mptcp_sock *msk = mptcp_sk(sk); 2497 2449 bool dispose_it, need_push = false; 2450 + int fwd_remaining; 2451 + 2452 + /* Do not pass RX data to the msk, even if the subflow socket is not 2453 + * going to be freed (i.e. even for the first subflow on graceful 2454 + * subflow close. 
2455 + */ 2456 + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); 2457 + subflow->closing = 1; 2458 + 2459 + /* Borrow the fwd allocated page left-over; fwd memory for the subflow 2460 + * could be negative at this point, but will be reach zero soon - when 2461 + * the data allocated using such fragment will be freed. 2462 + */ 2463 + if (subflow->lent_mem_frag) { 2464 + fwd_remaining = PAGE_SIZE - subflow->lent_mem_frag; 2465 + sk_forward_alloc_add(sk, fwd_remaining); 2466 + sk_forward_alloc_add(ssk, -fwd_remaining); 2467 + subflow->lent_mem_frag = 0; 2468 + } 2498 2469 2499 2470 /* If the first subflow moved to a close state before accept, e.g. due 2500 2471 * to an incoming reset or listener shutdown, the subflow socket is ··· 2525 2458 /* ensure later check in mptcp_worker() will dispose the msk */ 2526 2459 sock_set_flag(sk, SOCK_DEAD); 2527 2460 mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1)); 2528 - lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); 2529 2461 mptcp_subflow_drop_ctx(ssk); 2530 2462 goto out_release; 2531 2463 } ··· 2532 2466 dispose_it = msk->free_first || ssk != msk->first; 2533 2467 if (dispose_it) 2534 2468 list_del(&subflow->node); 2535 - 2536 - lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); 2537 2469 2538 2470 if (subflow->send_fastclose && ssk->sk_state != TCP_CLOSE) 2539 2471 tcp_set_state(ssk, TCP_CLOSE); ··· 2595 2531 void mptcp_close_ssk(struct sock *sk, struct sock *ssk, 2596 2532 struct mptcp_subflow_context *subflow) 2597 2533 { 2534 + struct mptcp_sock *msk = mptcp_sk(sk); 2535 + struct sk_buff *skb; 2536 + 2598 2537 /* The first subflow can already be closed and still in the list */ 2599 2538 if (subflow->close_event_done) 2600 2539 return; ··· 2606 2539 2607 2540 if (sk->sk_state == TCP_ESTABLISHED) 2608 2541 mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); 2542 + 2543 + /* Remove any reference from the backlog to this ssk; backlog skbs consume 2544 + * space in the msk receive queue, no need to 
touch sk->sk_rmem_alloc 2545 + */ 2546 + list_for_each_entry(skb, &msk->backlog_list, list) { 2547 + if (skb->sk != ssk) 2548 + continue; 2549 + 2550 + atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc); 2551 + skb->sk = NULL; 2552 + } 2609 2553 2610 2554 /* subflow aborted before reaching the fully_established status 2611 2555 * attempt the creation of the next subflow ··· 2847 2769 unlock_sock_fast(ssk, slow); 2848 2770 } 2849 2771 2772 + static void mptcp_backlog_purge(struct sock *sk) 2773 + { 2774 + struct mptcp_sock *msk = mptcp_sk(sk); 2775 + struct sk_buff *tmp, *skb; 2776 + LIST_HEAD(backlog); 2777 + 2778 + mptcp_data_lock(sk); 2779 + list_splice_init(&msk->backlog_list, &backlog); 2780 + msk->backlog_len = 0; 2781 + mptcp_data_unlock(sk); 2782 + 2783 + list_for_each_entry_safe(skb, tmp, &backlog, list) { 2784 + mptcp_borrow_fwdmem(sk, skb); 2785 + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); 2786 + } 2787 + sk_mem_reclaim(sk); 2788 + } 2789 + 2850 2790 static void mptcp_do_fastclose(struct sock *sk) 2851 2791 { 2852 2792 struct mptcp_subflow_context *subflow, *tmp; 2853 2793 struct mptcp_sock *msk = mptcp_sk(sk); 2854 2794 2855 2795 mptcp_set_state(sk, TCP_CLOSE); 2796 + mptcp_backlog_purge(sk); 2856 2797 2857 2798 /* Explicitly send the fastclose reset as need */ 2858 2799 if (__mptcp_check_fallback(msk)) ··· 2950 2853 INIT_LIST_HEAD(&msk->conn_list); 2951 2854 INIT_LIST_HEAD(&msk->join_list); 2952 2855 INIT_LIST_HEAD(&msk->rtx_queue); 2856 + INIT_LIST_HEAD(&msk->backlog_list); 2953 2857 INIT_WORK(&msk->work, mptcp_worker); 2954 2858 msk->out_of_order_queue = RB_ROOT; 2955 2859 msk->first_pending = NULL; 2956 2860 msk->timer_ival = TCP_RTO_MIN; 2957 2861 msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; 2862 + msk->backlog_len = 0; 2958 2863 2959 2864 WRITE_ONCE(msk->first, NULL); 2960 2865 inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; ··· 3327 3228 inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; 3328 3229 } 3329 3230 3231 + static 
void mptcp_destroy_common(struct mptcp_sock *msk) 3232 + { 3233 + struct mptcp_subflow_context *subflow, *tmp; 3234 + struct sock *sk = (struct sock *)msk; 3235 + 3236 + __mptcp_clear_xmit(sk); 3237 + mptcp_backlog_purge(sk); 3238 + 3239 + /* join list will be eventually flushed (with rst) at sock lock release time */ 3240 + mptcp_for_each_subflow_safe(msk, subflow, tmp) 3241 + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, 0); 3242 + 3243 + __skb_queue_purge(&sk->sk_receive_queue); 3244 + skb_rbtree_purge(&msk->out_of_order_queue); 3245 + 3246 + /* move all the rx fwd alloc into the sk_mem_reclaim_final in 3247 + * inet_sock_destruct() will dispose it 3248 + */ 3249 + mptcp_token_destroy(msk); 3250 + mptcp_pm_destroy(msk); 3251 + } 3252 + 3330 3253 static int mptcp_disconnect(struct sock *sk, int flags) 3331 3254 { 3332 3255 struct mptcp_sock *msk = mptcp_sk(sk); ··· 3400 3279 msk->bytes_sent = 0; 3401 3280 msk->bytes_retrans = 0; 3402 3281 msk->rcvspace_init = 0; 3282 + 3283 + /* for fallback's sake */ 3284 + WRITE_ONCE(msk->ack_seq, 0); 3403 3285 3404 3286 WRITE_ONCE(sk->sk_shutdown, 0); 3405 3287 sk_error_report(sk); ··· 3554 3430 msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; 3555 3431 } 3556 3432 3557 - void mptcp_destroy_common(struct mptcp_sock *msk) 3558 - { 3559 - struct mptcp_subflow_context *subflow, *tmp; 3560 - struct sock *sk = (struct sock *)msk; 3561 - 3562 - __mptcp_clear_xmit(sk); 3563 - 3564 - /* join list will be eventually flushed (with rst) at sock lock release time */ 3565 - mptcp_for_each_subflow_safe(msk, subflow, tmp) 3566 - __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, 0); 3567 - 3568 - __skb_queue_purge(&sk->sk_receive_queue); 3569 - skb_rbtree_purge(&msk->out_of_order_queue); 3570 - 3571 - /* move all the rx fwd alloc into the sk_mem_reclaim_final in 3572 - * inet_sock_destruct() will dispose it 3573 - */ 3574 - mptcp_token_destroy(msk); 3575 - mptcp_pm_destroy(msk); 3576 - } 3577 - 3578 
3433 static void mptcp_destroy(struct sock *sk) 3579 3434 { 3580 3435 struct mptcp_sock *msk = mptcp_sk(sk); ··· 3582 3479 3583 3480 #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ 3584 3481 BIT(MPTCP_RETRANSMIT) | \ 3585 - BIT(MPTCP_FLUSH_JOIN_LIST) | \ 3586 - BIT(MPTCP_DEQUEUE)) 3482 + BIT(MPTCP_FLUSH_JOIN_LIST)) 3587 3483 3588 3484 /* processes deferred events and flush wmem */ 3589 3485 static void mptcp_release_cb(struct sock *sk) ··· 3592 3490 3593 3491 for (;;) { 3594 3492 unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED); 3595 - struct list_head join_list; 3493 + struct list_head join_list, skbs; 3494 + bool spool_bl; 3495 + u32 moved; 3596 3496 3597 - if (!flags) 3497 + spool_bl = mptcp_can_spool_backlog(sk, &skbs); 3498 + if (!flags && !spool_bl) 3598 3499 break; 3599 3500 3600 3501 INIT_LIST_HEAD(&join_list); ··· 3619 3514 __mptcp_push_pending(sk, 0); 3620 3515 if (flags & BIT(MPTCP_RETRANSMIT)) 3621 3516 __mptcp_retrans(sk); 3622 - if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) { 3517 + if (spool_bl && __mptcp_move_skbs(sk, &skbs, &moved)) { 3623 3518 /* notify ack seq update */ 3624 3519 mptcp_cleanup_rbuf(msk, 0); 3625 3520 sk->sk_data_ready(sk); ··· 3627 3522 3628 3523 cond_resched(); 3629 3524 spin_lock_bh(&sk->sk_lock.slock); 3525 + if (spool_bl) 3526 + mptcp_backlog_spooled(sk, moved, &skbs); 3630 3527 } 3631 3528 3632 3529 if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) ··· 3754 3647 write_unlock_bh(&sk->sk_callback_lock); 3755 3648 } 3756 3649 3650 + /* Can be called without holding the msk socket lock; use the callback lock 3651 + * to avoid {READ_,WRITE_}ONCE annotations on sk_socket. 
3652 + */ 3653 + static void mptcp_sock_check_graft(struct sock *sk, struct sock *ssk) 3654 + { 3655 + struct socket *sock; 3656 + 3657 + write_lock_bh(&sk->sk_callback_lock); 3658 + sock = sk->sk_socket; 3659 + write_unlock_bh(&sk->sk_callback_lock); 3660 + if (sock) { 3661 + mptcp_sock_graft(ssk, sock); 3662 + __mptcp_inherit_cgrp_data(sk, ssk); 3663 + __mptcp_inherit_memcg(sk, ssk, GFP_ATOMIC); 3664 + } 3665 + } 3666 + 3757 3667 bool mptcp_finish_join(struct sock *ssk) 3758 3668 { 3759 3669 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); ··· 3786 3662 return false; 3787 3663 } 3788 3664 3789 - /* active subflow, already present inside the conn_list */ 3665 + /* Active subflow, already present inside the conn_list; is grafted 3666 + * either by __mptcp_subflow_connect() or accept. 3667 + */ 3790 3668 if (!list_empty(&subflow->node)) { 3791 3669 spin_lock_bh(&msk->fallback_lock); 3792 3670 if (!msk->allow_subflows) { ··· 3815 3689 if (ret) { 3816 3690 sock_hold(ssk); 3817 3691 list_add_tail(&subflow->node, &msk->conn_list); 3692 + mptcp_sock_check_graft(parent, ssk); 3818 3693 } 3819 3694 } else { 3820 3695 sock_hold(ssk); 3821 3696 list_add_tail(&subflow->node, &msk->join_list); 3822 3697 __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); 3698 + 3699 + /* In case of later failures, __mptcp_flush_join_list() will 3700 + * properly orphan the ssk via mptcp_close_ssk(). 
3701 + */ 3702 + mptcp_sock_check_graft(parent, ssk); 3823 3703 } 3824 3704 mptcp_data_unlock(parent); 3825 3705 ··· 3886 3754 return -EINVAL; 3887 3755 3888 3756 lock_sock(sk); 3889 - if (__mptcp_move_skbs(sk)) 3757 + if (mptcp_move_skbs(sk)) 3890 3758 mptcp_cleanup_rbuf(msk, 0); 3891 3759 *karg = mptcp_inq_hint(sk); 3892 3760 release_sock(sk); ··· 4086 3954 return err; 4087 3955 } 4088 3956 3957 + static void mptcp_graft_subflows(struct sock *sk) 3958 + { 3959 + struct mptcp_subflow_context *subflow; 3960 + struct mptcp_sock *msk = mptcp_sk(sk); 3961 + 3962 + if (mem_cgroup_sockets_enabled) { 3963 + LIST_HEAD(join_list); 3964 + 3965 + /* Subflows joining after __inet_accept() will get the 3966 + * mem CG properly initialized at mptcp_finish_join() time, 3967 + * but subflows pending in join_list need explicit 3968 + * initialization before flushing `backlog_unaccounted` 3969 + * or MPTCP can later unexpectedly observe unaccounted memory. 3970 + */ 3971 + mptcp_data_lock(sk); 3972 + list_splice_init(&msk->join_list, &join_list); 3973 + mptcp_data_unlock(sk); 3974 + 3975 + __mptcp_flush_join_list(sk, &join_list); 3976 + } 3977 + 3978 + mptcp_for_each_subflow(msk, subflow) { 3979 + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 3980 + 3981 + lock_sock(ssk); 3982 + 3983 + /* Set ssk->sk_socket of accept()ed flows to mptcp socket. 3984 + * This is needed so NOSPACE flag can be set from tcp stack. 3985 + */ 3986 + if (!ssk->sk_socket) 3987 + mptcp_sock_graft(ssk, sk->sk_socket); 3988 + 3989 + if (!mem_cgroup_sk_enabled(sk)) 3990 + goto unlock; 3991 + 3992 + __mptcp_inherit_cgrp_data(sk, ssk); 3993 + __mptcp_inherit_memcg(sk, ssk, GFP_KERNEL); 3994 + 3995 + unlock: 3996 + release_sock(ssk); 3997 + } 3998 + 3999 + if (mem_cgroup_sk_enabled(sk)) { 4000 + gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; 4001 + int amt; 4002 + 4003 + /* Account the backlog memory; prior accept() is aware of 4004 + * fwd and rmem only. 
4005 + */ 4006 + mptcp_data_lock(sk); 4007 + amt = sk_mem_pages(sk->sk_forward_alloc + 4008 + msk->backlog_unaccounted + 4009 + atomic_read(&sk->sk_rmem_alloc)) - 4010 + sk_mem_pages(sk->sk_forward_alloc + 4011 + atomic_read(&sk->sk_rmem_alloc)); 4012 + msk->backlog_unaccounted = 0; 4013 + mptcp_data_unlock(sk); 4014 + 4015 + if (amt) 4016 + mem_cgroup_sk_charge(sk, amt, gfp); 4017 + } 4018 + } 4019 + 4089 4020 static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, 4090 4021 struct proto_accept_arg *arg) 4091 4022 { ··· 4196 4001 msk = mptcp_sk(newsk); 4197 4002 msk->in_accept_queue = 0; 4198 4003 4199 - /* set ssk->sk_socket of accept()ed flows to mptcp socket. 4200 - * This is needed so NOSPACE flag can be set from tcp stack. 4201 - */ 4202 - mptcp_for_each_subflow(msk, subflow) { 4203 - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 4204 - 4205 - if (!ssk->sk_socket) 4206 - mptcp_sock_graft(ssk, newsock); 4207 - } 4208 - 4004 + mptcp_graft_subflows(newsk); 4209 4005 mptcp_rps_record_subflows(msk); 4210 4006 4211 4007 /* Do late cleanup for the first subflow as necessary. Also 4212 4008 * deal with bad peers not doing a complete shutdown. 4213 4009 */ 4214 4010 if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { 4215 - __mptcp_close_ssk(newsk, msk->first, 4216 - mptcp_subflow_ctx(msk->first), 0); 4217 4011 if (unlikely(list_is_singular(&msk->conn_list))) 4218 4012 mptcp_set_state(newsk, TCP_CLOSE); 4013 + mptcp_close_ssk(newsk, msk->first, 4014 + mptcp_subflow_ctx(msk->first)); 4219 4015 } 4220 4016 } else { 4221 4017 tcpfallback:
+47 -4
net/mptcp/protocol.h
··· 124 124 #define MPTCP_FLUSH_JOIN_LIST 5 125 125 #define MPTCP_SYNC_STATE 6 126 126 #define MPTCP_SYNC_SNDBUF 7 127 - #define MPTCP_DEQUEUE 8 128 127 129 128 struct mptcp_skb_cb { 130 129 u64 map_seq; ··· 356 357 * allow_infinite_fallback and 357 358 * allow_join 358 359 */ 360 + 361 + struct list_head backlog_list; /* protected by the data lock */ 362 + u32 backlog_len; 363 + u32 backlog_unaccounted; 359 364 }; 360 365 361 366 #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) ··· 410 407 static inline int __mptcp_space(const struct sock *sk) 411 408 { 412 409 return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - 410 + READ_ONCE(mptcp_sk(sk)->backlog_len) - 413 411 sk_rmem_alloc_get(sk)); 414 412 } 415 413 ··· 540 536 send_infinite_map : 1, 541 537 remote_key_valid : 1, /* received the peer key from */ 542 538 disposable : 1, /* ctx can be free at ulp release time */ 539 + closing : 1, /* must not pass rx data to msk anymore */ 543 540 stale : 1, /* unable to snd/rcv data, do not use for xmit */ 544 541 valid_csum_seen : 1, /* at least one csum validated */ 545 542 is_mptfo : 1, /* subflow is doing TFO */ 546 543 close_event_done : 1, /* has done the post-closed part */ 547 544 mpc_drop : 1, /* the MPC option has been dropped in a rtx */ 548 - __unused : 9; 545 + __unused : 8; 549 546 bool data_avail; 550 547 bool scheduled; 551 548 bool pm_listener; /* a listener managed by the kernel PM? */ 552 549 bool fully_established; /* path validated */ 550 + u32 lent_mem_frag; 553 551 u32 remote_nonce; 554 552 u64 thmac; 555 553 u32 local_nonce; ··· 651 645 tcp_send_active_reset(sk, GFP_ATOMIC, reason); 652 646 } 653 647 648 + /* Make the fwd mem carried by the given skb available to the msk. 649 + * To be paired with a previous mptcp_subflow_lend_fwdmem() before freeing 650 + * the skb or setting the skb ownership.
651 + */ 652 + static inline void mptcp_borrow_fwdmem(struct sock *sk, struct sk_buff *skb) 653 + { 654 + struct sock *ssk = skb->sk; 655 + 656 + /* The subflow just lent the skb fwd memory; if the subflow meanwhile 657 + * closed, mptcp_close_ssk() already released the ssk rcv memory. 658 + */ 659 + DEBUG_NET_WARN_ON_ONCE(skb->destructor); 660 + sk_forward_alloc_add(sk, skb->truesize); 661 + if (!ssk) 662 + return; 663 + 664 + atomic_sub(skb->truesize, &ssk->sk_rmem_alloc); 665 + skb->sk = NULL; 666 + } 667 + 668 + static inline void 669 + __mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, int size) 670 + { 671 + int frag = (subflow->lent_mem_frag + size) & (PAGE_SIZE - 1); 672 + 673 + subflow->lent_mem_frag = frag; 674 + } 675 + 676 + static inline void 677 + mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, 678 + struct sk_buff *skb) 679 + { 680 + __mptcp_subflow_lend_fwdmem(subflow, skb->truesize); 681 + skb->destructor = NULL; 682 + } 683 + 654 684 static inline u64 655 685 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) 656 686 { ··· 748 706 local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); 749 707 return ret; 750 708 } 709 + 710 + void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp); 711 + void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk); 751 712 752 713 int mptcp_is_enabled(const struct net *net); 753 714 unsigned int mptcp_get_add_addr_timeout(const struct net *net); ··· 1021 976 mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF); 1022 977 local_bh_enable(); 1023 978 } 1024 - 1025 - void mptcp_destroy_common(struct mptcp_sock *msk); 1026 979 1027 980 #define MPTCP_TOKEN_MAX_RETRIES 4 1028 981
+32 -10
net/mptcp/subflow.c
··· 491 491 mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); 492 492 subflow->iasn++; 493 493 494 + /* for fallback's sake */ 495 + subflow->map_seq = subflow->iasn; 496 + 494 497 WRITE_ONCE(msk->remote_key, subflow->remote_key); 495 498 WRITE_ONCE(msk->ack_seq, subflow->iasn); 496 499 WRITE_ONCE(msk->can_ack, true); ··· 1288 1285 /* sched mptcp worker for subflow cleanup if no more data is pending */ 1289 1286 static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) 1290 1287 { 1288 + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1291 1289 struct sock *sk = (struct sock *)msk; 1292 1290 1293 1291 if (likely(ssk->sk_state != TCP_CLOSE && ··· 1307 1303 */ 1308 1304 if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) && 1309 1305 msk->first == ssk && 1310 - mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) 1306 + mptcp_update_rcv_data_fin(msk, subflow->map_seq + 1307 + subflow->map_data_len, true)) 1311 1308 mptcp_schedule_work(sk); 1312 1309 } 1313 1310 ··· 1438 1433 1439 1434 skb = skb_peek(&ssk->sk_receive_queue); 1440 1435 subflow->map_valid = 1; 1441 - subflow->map_seq = READ_ONCE(msk->ack_seq); 1442 1436 subflow->map_data_len = skb->len; 1443 1437 subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; 1438 + subflow->map_seq = __mptcp_expand_seq(subflow->map_seq, 1439 + subflow->iasn + 1440 + TCP_SKB_CB(skb)->seq - 1441 + subflow->ssn_offset - 1); 1444 1442 WRITE_ONCE(subflow->data_avail, true); 1445 1443 return true; 1446 1444 } ··· 1720 1712 return err; 1721 1713 } 1722 1714 1723 - static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) 1715 + void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp) 1716 + { 1717 + /* Only if the msk has been accepted already (and not orphaned).*/ 1718 + if (!mem_cgroup_sockets_enabled || !sk->sk_socket) 1719 + return; 1720 + 1721 + mem_cgroup_sk_inherit(sk, ssk); 1722 + __sk_charge(ssk, gfp); 
1723 + } 1724 + 1725 + void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk) 1724 1726 { 1725 1727 #ifdef CONFIG_SOCK_CGROUP_DATA 1726 - struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, 1727 - *child_skcd = &child->sk_cgrp_data; 1728 + struct sock_cgroup_data *sk_cd = &sk->sk_cgrp_data, 1729 + *ssk_cd = &ssk->sk_cgrp_data; 1728 1730 1729 1731 /* only the additional subflows created by kworkers have to be modified */ 1730 - if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != 1731 - cgroup_id(sock_cgroup_ptr(child_skcd))) { 1732 - cgroup_sk_free(child_skcd); 1733 - *child_skcd = *parent_skcd; 1734 - cgroup_sk_clone(child_skcd); 1732 + if (cgroup_id(sock_cgroup_ptr(sk_cd)) != 1733 + cgroup_id(sock_cgroup_ptr(ssk_cd))) { 1734 + cgroup_sk_free(ssk_cd); 1735 + *ssk_cd = *sk_cd; 1736 + cgroup_sk_clone(sk_cd); 1735 1737 } 1736 1738 #endif /* CONFIG_SOCK_CGROUP_DATA */ 1739 + } 1737 1740 1741 + static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) 1742 + { 1743 + __mptcp_inherit_cgrp_data(parent, child); 1738 1744 if (mem_cgroup_sockets_enabled) 1739 1745 mem_cgroup_sk_inherit(parent, child); 1740 1746 }