Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'mptcp-lowat-sockopt'

Matthieu Baerts says:

====================
mptcp: add TCP_NOTSENT_LOWAT sockopt support

Patch 3 does the magic of adding TCP_NOTSENT_LOWAT support; all the
other ones are minor cleanups spotted along the way while working on
the new feature.

Note that this feature relies on the existing accounting for snd_nxt.
Such accounting is not 110% accurate as it tracks the most recent
sequence number queued to any subflow, and not the actual sequence
number sent on the wire. Paolo experimented a lot, trying to implement
the latter, and in the end it proved to be both "too complex" and "not
necessary".

The complexity arises from the need for an additional lock and a lot of
refactoring to introduce such protections without adding significant
overhead. Additionally, snd_nxt is currently used and exposed with the
current semantics by the internal packet scheduling. Introducing a
different tracking would still require us to keep the old one.

More interestingly, a more accurate tracking may not be strictly
necessary: as the MPTCP socket enqueues data to the subflows only up
to the available send window, any enqueued data is sent on the wire
instantly, without any blocking operation, short of a drop in the tx
path at the nft or TC layer.
====================

Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>

+101 -66
+37 -17
net/mptcp/protocol.c
··· 1692 1692 } 1693 1693 } 1694 1694 1695 - static void mptcp_set_nospace(struct sock *sk) 1696 - { 1697 - /* enable autotune */ 1698 - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1699 - 1700 - /* will be cleared on avail space */ 1701 - set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); 1702 - } 1703 - 1704 1695 static int mptcp_disconnect(struct sock *sk, int flags); 1705 1696 1706 1697 static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, ··· 1762 1771 return 0; 1763 1772 } 1764 1773 1774 + /* open-code sk_stream_memory_free() plus sent limit computation to 1775 + * avoid indirect calls in fast-path. 1776 + * Called under the msk socket lock, so we can avoid a bunch of ONCE 1777 + * annotations. 1778 + */ 1779 + static u32 mptcp_send_limit(const struct sock *sk) 1780 + { 1781 + const struct mptcp_sock *msk = mptcp_sk(sk); 1782 + u32 limit, not_sent; 1783 + 1784 + if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf)) 1785 + return 0; 1786 + 1787 + limit = mptcp_notsent_lowat(sk); 1788 + if (limit == UINT_MAX) 1789 + return UINT_MAX; 1790 + 1791 + not_sent = msk->write_seq - msk->snd_nxt; 1792 + if (not_sent >= limit) 1793 + return 0; 1794 + 1795 + return limit - not_sent; 1796 + } 1797 + 1765 1798 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) 1766 1799 { 1767 1800 struct mptcp_sock *msk = mptcp_sk(sk); ··· 1830 1815 struct mptcp_data_frag *dfrag; 1831 1816 bool dfrag_collapsed; 1832 1817 size_t psize, offset; 1818 + u32 copy_limit; 1819 + 1820 + /* ensure fitting the notsent_lowat() constraint */ 1821 + copy_limit = mptcp_send_limit(sk); 1822 + if (!copy_limit) 1823 + goto wait_for_memory; 1833 1824 1834 1825 /* reuse tail pfrag, if possible, or carve a new one from the 1835 1826 * page allocator ··· 1843 1822 dfrag = mptcp_pending_tail(sk); 1844 1823 dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); 1845 1824 if (!dfrag_collapsed) { 1846 - if (!sk_stream_memory_free(sk)) 1847 - goto wait_for_memory; 1848 - 
1849 1825 if (!mptcp_page_frag_refill(sk, pfrag)) 1850 1826 goto wait_for_memory; 1851 1827 ··· 1857 1839 offset = dfrag->offset + dfrag->data_len; 1858 1840 psize = pfrag->size - offset; 1859 1841 psize = min_t(size_t, psize, msg_data_left(msg)); 1842 + psize = min_t(size_t, psize, copy_limit); 1860 1843 total_ts = psize + frag_truesize; 1861 1844 1862 1845 if (!sk_wmem_schedule(sk, total_ts)) ··· 1893 1874 continue; 1894 1875 1895 1876 wait_for_memory: 1896 - mptcp_set_nospace(sk); 1877 + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1897 1878 __mptcp_push_pending(sk, msg->msg_flags); 1898 1879 ret = sk_stream_wait_memory(sk, &timeo); 1899 1880 if (ret) ··· 3788 3769 .unhash = mptcp_unhash, 3789 3770 .get_port = mptcp_get_port, 3790 3771 .forward_alloc_get = mptcp_forward_alloc_get, 3772 + .stream_memory_free = mptcp_stream_memory_free, 3791 3773 .sockets_allocated = &mptcp_sockets_allocated, 3792 3774 3793 3775 .memory_allocated = &tcp_memory_allocated, ··· 3962 3942 { 3963 3943 struct sock *sk = (struct sock *)msk; 3964 3944 3965 - if (sk_stream_is_writeable(sk)) 3945 + if (__mptcp_stream_is_writeable(sk, 1)) 3966 3946 return EPOLLOUT | EPOLLWRNORM; 3967 3947 3968 - mptcp_set_nospace(sk); 3969 - smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ 3970 - if (sk_stream_is_writeable(sk)) 3948 + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 3949 + smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */ 3950 + if (__mptcp_stream_is_writeable(sk, 1)) 3971 3951 return EPOLLOUT | EPOLLWRNORM; 3972 3952 3973 3953 return 0;
+32 -10
net/mptcp/protocol.h
··· 113 113 #define MPTCP_RST_TRANSIENT BIT(0) 114 114 115 115 /* MPTCP socket atomic flags */ 116 - #define MPTCP_NOSPACE 1 117 - #define MPTCP_WORK_RTX 2 118 - #define MPTCP_FALLBACK_DONE 4 119 - #define MPTCP_WORK_CLOSE_SUBFLOW 5 116 + #define MPTCP_WORK_RTX 1 117 + #define MPTCP_FALLBACK_DONE 2 118 + #define MPTCP_WORK_CLOSE_SUBFLOW 3 120 119 121 120 /* MPTCP socket release cb flags */ 122 121 #define MPTCP_PUSH_PENDING 1 ··· 307 308 in_accept_queue:1, 308 309 free_first:1, 309 310 rcvspace_init:1; 311 + u32 notsent_lowat; 310 312 struct work_struct work; 311 313 struct sk_buff *ooo_last_skb; 312 314 struct rb_root out_of_order_queue; ··· 808 808 READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); 809 809 } 810 810 811 + static inline u32 mptcp_notsent_lowat(const struct sock *sk) 812 + { 813 + struct net *net = sock_net(sk); 814 + u32 val; 815 + 816 + val = READ_ONCE(mptcp_sk(sk)->notsent_lowat); 817 + return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); 818 + } 819 + 820 + static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake) 821 + { 822 + const struct mptcp_sock *msk = mptcp_sk(sk); 823 + u32 notsent_bytes; 824 + 825 + notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt); 826 + return (notsent_bytes << wake) < mptcp_notsent_lowat(sk); 827 + } 828 + 829 + static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake) 830 + { 831 + return mptcp_stream_memory_free(sk, wake) && 832 + __sk_stream_is_writeable(sk, wake); 833 + } 834 + 811 835 static inline void mptcp_write_space(struct sock *sk) 812 836 { 813 - if (sk_stream_is_writeable(sk)) { 814 - /* pairs with memory barrier in mptcp_poll */ 815 - smp_mb(); 816 - if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags)) 817 - sk_stream_write_space(sk); 818 - } 837 + /* pairs with memory barrier in mptcp_poll */ 838 + smp_mb(); 839 + if (mptcp_stream_memory_free(sk, 1)) 840 + sk_stream_write_space(sk); 819 841 } 820 842 821 843 static inline 
void __mptcp_sync_sndbuf(struct sock *sk)
+32 -39
net/mptcp/sockopt.c
··· 624 624 return ret; 625 625 } 626 626 627 - static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval, 628 - unsigned int optlen) 627 + static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val) 629 628 { 630 629 struct mptcp_subflow_context *subflow; 631 630 struct sock *sk = (struct sock *)msk; 632 - int val; 633 631 634 - if (optlen < sizeof(int)) 635 - return -EINVAL; 636 - 637 - if (copy_from_sockptr(&val, optval, sizeof(val))) 638 - return -EFAULT; 639 - 640 - lock_sock(sk); 641 632 sockopt_seq_inc(msk); 642 633 msk->cork = !!val; 643 634 mptcp_for_each_subflow(msk, subflow) { ··· 640 649 } 641 650 if (!val) 642 651 mptcp_check_and_set_pending(sk); 643 - release_sock(sk); 644 652 645 653 return 0; 646 654 } 647 655 648 - static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval, 649 - unsigned int optlen) 656 + static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val) 650 657 { 651 658 struct mptcp_subflow_context *subflow; 652 659 struct sock *sk = (struct sock *)msk; 653 - int val; 654 660 655 - if (optlen < sizeof(int)) 656 - return -EINVAL; 657 - 658 - if (copy_from_sockptr(&val, optval, sizeof(val))) 659 - return -EFAULT; 660 - 661 - lock_sock(sk); 662 661 sockopt_seq_inc(msk); 663 662 msk->nodelay = !!val; 664 663 mptcp_for_each_subflow(msk, subflow) { ··· 660 679 } 661 680 if (val) 662 681 mptcp_check_and_set_pending(sk); 663 - release_sock(sk); 664 - 665 682 return 0; 666 683 } 667 684 ··· 782 803 int ret, val; 783 804 784 805 switch (optname) { 785 - case TCP_INQ: 786 - ret = mptcp_get_int_option(msk, optval, optlen, &val); 787 - if (ret) 788 - return ret; 789 - if (val < 0 || val > 1) 790 - return -EINVAL; 791 - 792 - lock_sock(sk); 793 - msk->recvmsg_inq = !!val; 794 - release_sock(sk); 795 - return 0; 796 806 case TCP_ULP: 797 807 return -EOPNOTSUPP; 798 808 case TCP_CONGESTION: 799 809 return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen); 800 - 
case TCP_CORK: 801 - return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen); 802 - case TCP_NODELAY: 803 - return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen); 804 810 case TCP_DEFER_ACCEPT: 805 811 /* See tcp.c: TCP_DEFER_ACCEPT does not fail */ 806 812 mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); ··· 798 834 optval, optlen); 799 835 } 800 836 801 - return -EOPNOTSUPP; 837 + ret = mptcp_get_int_option(msk, optval, optlen, &val); 838 + if (ret) 839 + return ret; 840 + 841 + lock_sock(sk); 842 + switch (optname) { 843 + case TCP_INQ: 844 + if (val < 0 || val > 1) 845 + ret = -EINVAL; 846 + else 847 + msk->recvmsg_inq = !!val; 848 + break; 849 + case TCP_NOTSENT_LOWAT: 850 + WRITE_ONCE(msk->notsent_lowat, val); 851 + mptcp_write_space(sk); 852 + break; 853 + case TCP_CORK: 854 + ret = __mptcp_setsockopt_sol_tcp_cork(msk, val); 855 + break; 856 + case TCP_NODELAY: 857 + ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val); 858 + break; 859 + default: 860 + ret = -ENOPROTOOPT; 861 + } 862 + 863 + release_sock(sk); 864 + return ret; 802 865 } 803 866 804 867 int mptcp_setsockopt(struct sock *sk, int level, int optname, ··· 1340 1349 return mptcp_put_int_option(msk, optval, optlen, msk->cork); 1341 1350 case TCP_NODELAY: 1342 1351 return mptcp_put_int_option(msk, optval, optlen, msk->nodelay); 1352 + case TCP_NOTSENT_LOWAT: 1353 + return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat); 1343 1354 } 1344 1355 return -EOPNOTSUPP; 1345 1356 }