Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'mptcp-prepare-mptcp-packet-scheduler-for-bpf-extension'

Mat Martineau says:

====================
mptcp: Prepare MPTCP packet scheduler for BPF extension

The kernel's MPTCP packet scheduler has, to date, been a hard-coded,
one-size-fits-all algorithm. It attempts to balance latency and
throughput when transmitting data across multiple TCP subflows, and has
some limited tunability through sysctls. It has been a long-term goal of
the Linux MPTCP community to support customizable packet schedulers for
use cases that need to make different trade-offs regarding latency,
throughput, redundancy, and other metrics. BPF is well-suited for
configuring customized, per-packet scheduling decisions without having
to modify the kernel or manage out-of-tree kernel modules.

The first steps toward implementing BPF packet schedulers are to update
the existing MPTCP transmit loops to allow more flexible scheduling
decisions, and to add infrastructure for swappable packet schedulers.
The existing scheduling algorithm remains the default. BPF-related
changes will be in a future patch series.

This code has been in the MPTCP development tree for quite a while,
undergoing testing in our CI and by the community.

Patches 1 and 2 refactor the transmit code and do some related cleanup.

Patches 3-9 add infrastructure for registering and calling multiple
schedulers.

Patch 10 connects the in-kernel default scheduler to the new
infrastructure.
====================

Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-0-0c860fb256a8@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+398 -137
+8
Documentation/networking/mptcp-sysctl.rst
··· 74 74 This is a per-namespace sysctl. 75 75 76 76 Default: 4 77 + 78 + scheduler - STRING 79 + Select the scheduler of your choice. 80 + 81 + Support for selection of different schedulers. This is a per-namespace 82 + sysctl. 83 + 84 + Default: "default"
+21
include/net/mptcp.h
··· 96 96 #endif 97 97 }; 98 98 99 + #define MPTCP_SCHED_NAME_MAX 16 100 + #define MPTCP_SUBFLOWS_MAX 8 101 + 102 + struct mptcp_sched_data { 103 + bool reinject; 104 + u8 subflows; 105 + struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; 106 + }; 107 + 108 + struct mptcp_sched_ops { 109 + int (*get_subflow)(struct mptcp_sock *msk, 110 + struct mptcp_sched_data *data); 111 + 112 + char name[MPTCP_SCHED_NAME_MAX]; 113 + struct module *owner; 114 + struct list_head list; 115 + 116 + void (*init)(struct mptcp_sock *msk); 117 + void (*release)(struct mptcp_sock *msk); 118 + } ____cacheline_aligned_in_smp; 119 + 99 120 #ifdef CONFIG_MPTCP 100 121 void mptcp_init(void); 101 122
+1 -1
net/mptcp/Makefile
··· 2 2 obj-$(CONFIG_MPTCP) += mptcp.o 3 3 4 4 mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ 5 - mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o 5 + mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o 6 6 7 7 obj-$(CONFIG_SYN_COOKIES) += syncookies.o 8 8 obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
+14
net/mptcp/ctrl.c
··· 32 32 u8 checksum_enabled; 33 33 u8 allow_join_initial_addr_port; 34 34 u8 pm_type; 35 + char scheduler[MPTCP_SCHED_NAME_MAX]; 35 36 }; 36 37 37 38 static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) ··· 70 69 return mptcp_get_pernet(net)->pm_type; 71 70 } 72 71 72 + const char *mptcp_get_scheduler(const struct net *net) 73 + { 74 + return mptcp_get_pernet(net)->scheduler; 75 + } 76 + 73 77 static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) 74 78 { 75 79 pernet->mptcp_enabled = 1; ··· 83 77 pernet->allow_join_initial_addr_port = 1; 84 78 pernet->stale_loss_cnt = 4; 85 79 pernet->pm_type = MPTCP_PM_TYPE_KERNEL; 80 + strcpy(pernet->scheduler, "default"); 86 81 } 87 82 88 83 #ifdef CONFIG_SYSCTL ··· 135 128 .extra1 = SYSCTL_ZERO, 136 129 .extra2 = &mptcp_pm_type_max 137 130 }, 131 + { 132 + .procname = "scheduler", 133 + .maxlen = MPTCP_SCHED_NAME_MAX, 134 + .mode = 0644, 135 + .proc_handler = proc_dostring, 136 + }, 138 137 {} 139 138 }; 140 139 ··· 162 149 table[3].data = &pernet->allow_join_initial_addr_port; 163 150 table[4].data = &pernet->stale_loss_cnt; 164 151 table[5].data = &pernet->pm_type; 152 + table[6].data = &pernet->scheduler; 165 153 166 154 hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); 167 155 if (!hdr)
+1 -8
net/mptcp/pm.c
··· 299 299 300 300 pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); 301 301 msk = mptcp_sk(sk); 302 - if (subflow->backup != bkup) { 302 + if (subflow->backup != bkup) 303 303 subflow->backup = bkup; 304 - mptcp_data_lock(sk); 305 - if (!sock_owned_by_user(sk)) 306 - msk->last_snd = NULL; 307 - else 308 - __set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags); 309 - mptcp_data_unlock(sk); 310 - } 311 304 312 305 mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC); 313 306 }
-3
net/mptcp/pm_netlink.c
··· 472 472 473 473 slow = lock_sock_fast(ssk); 474 474 if (prio) { 475 - if (subflow->backup != backup) 476 - msk->last_snd = NULL; 477 - 478 475 subflow->send_mp_prio = 1; 479 476 subflow->backup = backup; 480 477 subflow->request_bkup = backup;
+164 -123
net/mptcp/protocol.c
··· 1366 1366 * returns the subflow that will transmit the next DSS 1367 1367 * additionally updates the rtx timeout 1368 1368 */ 1369 - static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) 1369 + struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) 1370 1370 { 1371 1371 struct subflow_send_info send_info[SSK_MODE_MAX]; 1372 1372 struct mptcp_subflow_context *subflow; ··· 1376 1376 struct sock *ssk; 1377 1377 u64 linger_time; 1378 1378 long tout = 0; 1379 - 1380 - msk_owned_by_me(msk); 1381 - 1382 - if (__mptcp_check_fallback(msk)) { 1383 - if (!msk->first) 1384 - return NULL; 1385 - return __tcp_can_send(msk->first) && 1386 - sk_stream_memory_free(msk->first) ? msk->first : NULL; 1387 - } 1388 - 1389 - /* re-use last subflow, if the burst allow that */ 1390 - if (msk->last_snd && msk->snd_burst > 0 && 1391 - sk_stream_memory_free(msk->last_snd) && 1392 - mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { 1393 - mptcp_set_timeout(sk); 1394 - return msk->last_snd; 1395 - } 1396 1379 1397 1380 /* pick the subflow with the lower wmem/wspace ratio */ 1398 1381 for (i = 0; i < SSK_MODE_MAX; ++i) { ··· 1429 1446 1430 1447 burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); 1431 1448 wmem = READ_ONCE(ssk->sk_wmem_queued); 1432 - if (!burst) { 1433 - msk->last_snd = NULL; 1449 + if (!burst) 1434 1450 return ssk; 1435 - } 1436 1451 1437 1452 subflow = mptcp_subflow_ctx(ssk); 1438 1453 subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + 1439 1454 READ_ONCE(ssk->sk_pacing_rate) * burst, 1440 1455 burst + wmem); 1441 - msk->last_snd = ssk; 1442 1456 msk->snd_burst = burst; 1443 1457 return ssk; 1444 1458 } ··· 1479 1499 mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING); 1480 1500 } 1481 1501 1502 + static int __subflow_push_pending(struct sock *sk, struct sock *ssk, 1503 + struct mptcp_sendmsg_info *info) 1504 + { 1505 + struct mptcp_sock *msk = mptcp_sk(sk); 1506 + struct mptcp_data_frag *dfrag; 
1507 + int len, copied = 0, err = 0; 1508 + 1509 + while ((dfrag = mptcp_send_head(sk))) { 1510 + info->sent = dfrag->already_sent; 1511 + info->limit = dfrag->data_len; 1512 + len = dfrag->data_len - dfrag->already_sent; 1513 + while (len > 0) { 1514 + int ret = 0; 1515 + 1516 + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); 1517 + if (ret <= 0) { 1518 + err = copied ? : ret; 1519 + goto out; 1520 + } 1521 + 1522 + info->sent += ret; 1523 + copied += ret; 1524 + len -= ret; 1525 + 1526 + mptcp_update_post_push(msk, dfrag, ret); 1527 + } 1528 + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 1529 + 1530 + if (msk->snd_burst <= 0 || 1531 + !sk_stream_memory_free(ssk) || 1532 + !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) { 1533 + err = copied; 1534 + goto out; 1535 + } 1536 + mptcp_set_timeout(sk); 1537 + } 1538 + err = copied; 1539 + 1540 + out: 1541 + return err; 1542 + } 1543 + 1482 1544 void __mptcp_push_pending(struct sock *sk, unsigned int flags) 1483 1545 { 1484 1546 struct sock *prev_ssk = NULL, *ssk = NULL; ··· 1529 1507 .flags = flags, 1530 1508 }; 1531 1509 bool do_check_data_fin = false; 1532 - struct mptcp_data_frag *dfrag; 1533 - int len; 1510 + int push_count = 1; 1534 1511 1535 - while ((dfrag = mptcp_send_head(sk))) { 1536 - info.sent = dfrag->already_sent; 1537 - info.limit = dfrag->data_len; 1538 - len = dfrag->data_len - dfrag->already_sent; 1539 - while (len > 0) { 1540 - int ret = 0; 1512 + while (mptcp_send_head(sk) && (push_count > 0)) { 1513 + struct mptcp_subflow_context *subflow; 1514 + int ret = 0; 1541 1515 1542 - prev_ssk = ssk; 1543 - ssk = mptcp_subflow_get_send(msk); 1516 + if (mptcp_sched_get_send(msk)) 1517 + break; 1544 1518 1545 - /* First check. 
If the ssk has changed since 1546 - * the last round, release prev_ssk 1547 - */ 1548 - if (ssk != prev_ssk && prev_ssk) 1549 - mptcp_push_release(prev_ssk, &info); 1550 - if (!ssk) 1551 - goto out; 1519 + push_count = 0; 1552 1520 1553 - /* Need to lock the new subflow only if different 1554 - * from the previous one, otherwise we are still 1555 - * helding the relevant lock 1556 - */ 1557 - if (ssk != prev_ssk) 1558 - lock_sock(ssk); 1521 + mptcp_for_each_subflow(msk, subflow) { 1522 + if (READ_ONCE(subflow->scheduled)) { 1523 + mptcp_subflow_set_scheduled(subflow, false); 1559 1524 1560 - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 1561 - if (ret <= 0) { 1562 - if (ret == -EAGAIN) 1525 + prev_ssk = ssk; 1526 + ssk = mptcp_subflow_tcp_sock(subflow); 1527 + if (ssk != prev_ssk) { 1528 + /* First check. If the ssk has changed since 1529 + * the last round, release prev_ssk 1530 + */ 1531 + if (prev_ssk) 1532 + mptcp_push_release(prev_ssk, &info); 1533 + 1534 + /* Need to lock the new subflow only if different 1535 + * from the previous one, otherwise we are still 1536 + * helding the relevant lock 1537 + */ 1538 + lock_sock(ssk); 1539 + } 1540 + 1541 + push_count++; 1542 + 1543 + ret = __subflow_push_pending(sk, ssk, &info); 1544 + if (ret <= 0) { 1545 + if (ret != -EAGAIN || 1546 + (1 << ssk->sk_state) & 1547 + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE)) 1548 + push_count--; 1563 1549 continue; 1564 - mptcp_push_release(ssk, &info); 1565 - goto out; 1550 + } 1551 + do_check_data_fin = true; 1566 1552 } 1567 - 1568 - do_check_data_fin = true; 1569 - info.sent += ret; 1570 - len -= ret; 1571 - 1572 - mptcp_update_post_push(msk, dfrag, ret); 1573 1553 } 1574 - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 1575 1554 } 1576 1555 1577 1556 /* at this point we held the socket lock for the last subflow we used */ 1578 1557 if (ssk) 1579 1558 mptcp_push_release(ssk, &info); 1580 1559 1581 - out: 1582 1560 /* ensure the rtx timer is running */ 1583 1561 if 
(!mptcp_timer_pending(sk)) 1584 1562 mptcp_reset_timer(sk); ··· 1592 1570 struct mptcp_sendmsg_info info = { 1593 1571 .data_lock_held = true, 1594 1572 }; 1595 - struct mptcp_data_frag *dfrag; 1573 + bool keep_pushing = true; 1596 1574 struct sock *xmit_ssk; 1597 - int len, copied = 0; 1575 + int copied = 0; 1598 1576 1599 1577 info.flags = 0; 1600 - while ((dfrag = mptcp_send_head(sk))) { 1601 - info.sent = dfrag->already_sent; 1602 - info.limit = dfrag->data_len; 1603 - len = dfrag->data_len - dfrag->already_sent; 1604 - while (len > 0) { 1605 - int ret = 0; 1578 + while (mptcp_send_head(sk) && keep_pushing) { 1579 + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1580 + int ret = 0; 1606 1581 1607 - /* check for a different subflow usage only after 1608 - * spooling the first chunk of data 1609 - */ 1610 - xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk); 1611 - if (!xmit_ssk) 1612 - goto out; 1613 - if (xmit_ssk != ssk) { 1614 - mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), 1615 - MPTCP_DELEGATE_SEND); 1616 - goto out; 1617 - } 1618 - 1619 - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 1620 - if (ret <= 0) 1621 - goto out; 1622 - 1623 - info.sent += ret; 1624 - copied += ret; 1625 - len -= ret; 1582 + /* check for a different subflow usage only after 1583 + * spooling the first chunk of data 1584 + */ 1585 + if (first) { 1586 + mptcp_subflow_set_scheduled(subflow, false); 1587 + ret = __subflow_push_pending(sk, ssk, &info); 1626 1588 first = false; 1627 - 1628 - mptcp_update_post_push(msk, dfrag, ret); 1589 + if (ret <= 0) 1590 + break; 1591 + copied += ret; 1592 + continue; 1629 1593 } 1630 - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); 1594 + 1595 + if (mptcp_sched_get_send(msk)) 1596 + goto out; 1597 + 1598 + if (READ_ONCE(subflow->scheduled)) { 1599 + mptcp_subflow_set_scheduled(subflow, false); 1600 + ret = __subflow_push_pending(sk, ssk, &info); 1601 + if (ret <= 0) 1602 + keep_pushing = false; 1603 + copied += ret; 
1604 + } 1605 + 1606 + mptcp_for_each_subflow(msk, subflow) { 1607 + if (READ_ONCE(subflow->scheduled)) { 1608 + xmit_ssk = mptcp_subflow_tcp_sock(subflow); 1609 + if (xmit_ssk != ssk) { 1610 + mptcp_subflow_delegate(subflow, 1611 + MPTCP_DELEGATE_SEND); 1612 + keep_pushing = false; 1613 + } 1614 + } 1615 + } 1631 1616 } 1632 1617 1633 1618 out: ··· 2227 2198 * 2228 2199 * A backup subflow is returned only if that is the only kind available. 2229 2200 */ 2230 - static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) 2201 + struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) 2231 2202 { 2232 2203 struct sock *backup = NULL, *pick = NULL; 2233 2204 struct mptcp_subflow_context *subflow; 2234 2205 int min_stale_count = INT_MAX; 2235 - 2236 - msk_owned_by_me(msk); 2237 - 2238 - if (__mptcp_check_fallback(msk)) 2239 - return NULL; 2240 2206 2241 2207 mptcp_for_each_subflow(msk, subflow) { 2242 2208 struct sock *ssk = mptcp_subflow_tcp_sock(subflow); ··· 2394 2370 WRITE_ONCE(msk->first, NULL); 2395 2371 2396 2372 out: 2397 - if (ssk == msk->last_snd) 2398 - msk->last_snd = NULL; 2399 - 2400 2373 if (need_push) 2401 2374 __mptcp_push_pending(sk, 0); 2402 2375 } ··· 2510 2489 static void __mptcp_retrans(struct sock *sk) 2511 2490 { 2512 2491 struct mptcp_sock *msk = mptcp_sk(sk); 2492 + struct mptcp_subflow_context *subflow; 2513 2493 struct mptcp_sendmsg_info info = {}; 2514 2494 struct mptcp_data_frag *dfrag; 2515 - size_t copied = 0; 2516 2495 struct sock *ssk; 2517 - int ret; 2496 + int ret, err; 2497 + u16 len = 0; 2518 2498 2519 2499 mptcp_clean_una_wakeup(sk); 2520 2500 2521 2501 /* first check ssk: need to kick "stale" logic */ 2522 - ssk = mptcp_subflow_get_retrans(msk); 2502 + err = mptcp_sched_get_retrans(msk); 2523 2503 dfrag = mptcp_rtx_head(sk); 2524 2504 if (!dfrag) { 2525 2505 if (mptcp_data_fin_enabled(msk)) { ··· 2539 2517 goto reset_timer; 2540 2518 } 2541 2519 2542 - if (!ssk) 2520 + if (err) 2543 2521 goto reset_timer; 2544 
2522 2545 - lock_sock(ssk); 2523 + mptcp_for_each_subflow(msk, subflow) { 2524 + if (READ_ONCE(subflow->scheduled)) { 2525 + u16 copied = 0; 2546 2526 2547 - /* limit retransmission to the bytes already sent on some subflows */ 2548 - info.sent = 0; 2549 - info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; 2550 - while (info.sent < info.limit) { 2551 - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 2552 - if (ret <= 0) 2553 - break; 2527 + mptcp_subflow_set_scheduled(subflow, false); 2554 2528 2555 - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); 2556 - copied += ret; 2557 - info.sent += ret; 2529 + ssk = mptcp_subflow_tcp_sock(subflow); 2530 + 2531 + lock_sock(ssk); 2532 + 2533 + /* limit retransmission to the bytes already sent on some subflows */ 2534 + info.sent = 0; 2535 + info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : 2536 + dfrag->already_sent; 2537 + while (info.sent < info.limit) { 2538 + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 2539 + if (ret <= 0) 2540 + break; 2541 + 2542 + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); 2543 + copied += ret; 2544 + info.sent += ret; 2545 + } 2546 + if (copied) { 2547 + len = max(copied, len); 2548 + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, 2549 + info.size_goal); 2550 + WRITE_ONCE(msk->allow_infinite_fallback, false); 2551 + } 2552 + 2553 + release_sock(ssk); 2554 + } 2558 2555 } 2559 - if (copied) { 2560 - dfrag->already_sent = max(dfrag->already_sent, info.sent); 2561 - msk->bytes_retrans += copied; 2562 - tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, 2563 - info.size_goal); 2564 - WRITE_ONCE(msk->allow_infinite_fallback, false); 2565 - } 2566 2556 2567 - release_sock(ssk); 2557 + msk->bytes_retrans += len; 2558 + dfrag->already_sent = max(dfrag->already_sent, len); 2568 2559 2569 2560 reset_timer: 2570 2561 mptcp_check_and_set_pending(sk); ··· 2729 2694 static int mptcp_init_sock(struct sock *sk) 2730 2695 { 2731 2696 struct net 
*net = sock_net(sk); 2697 + int ret; 2732 2698 2733 2699 __mptcp_init_sock(sk); 2734 2700 ··· 2738 2702 2739 2703 if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) 2740 2704 return -ENOMEM; 2705 + 2706 + ret = mptcp_init_sched(mptcp_sk(sk), 2707 + mptcp_sched_find(mptcp_get_scheduler(net))); 2708 + if (ret) 2709 + return ret; 2741 2710 2742 2711 set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); 2743 2712 ··· 2889 2848 mptcp_stop_timer(sk); 2890 2849 sk_stop_timer(sk, &sk->sk_timer); 2891 2850 msk->pm.status = 0; 2851 + mptcp_release_sched(msk); 2892 2852 2893 2853 sk->sk_prot->destroy(sk); 2894 2854 ··· 3079 3037 * subflow 3080 3038 */ 3081 3039 mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); 3082 - msk->last_snd = NULL; 3083 3040 WRITE_ONCE(msk->flags, 0); 3084 3041 msk->cb_flags = 0; 3085 3042 msk->push_pending = 0; ··· 3144 3103 msk->snd_una = msk->write_seq; 3145 3104 msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; 3146 3105 msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; 3106 + mptcp_init_sched(msk, mptcp_sk(sk)->sched); 3147 3107 3148 3108 /* passive msk is created after the first/MPC subflow */ 3149 3109 msk->subflow_id = 2; ··· 3349 3307 __mptcp_set_connected(sk); 3350 3308 if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) 3351 3309 __mptcp_error_report(sk); 3352 - if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags)) 3353 - msk->last_snd = NULL; 3354 3310 } 3355 3311 3356 3312 __mptcp_update_rmem(sk); ··· 3965 3925 3966 3926 mptcp_subflow_init(); 3967 3927 mptcp_pm_init(); 3928 + mptcp_sched_init(); 3968 3929 mptcp_token_init(); 3969 3930 3970 3931 if (proto_register(&mptcp_prot, 1) != 0)
+16 -2
net/mptcp/protocol.h
··· 123 123 #define MPTCP_RETRANSMIT 4 124 124 #define MPTCP_FLUSH_JOIN_LIST 5 125 125 #define MPTCP_CONNECTED 6 126 - #define MPTCP_RESET_SCHEDULER 7 127 126 128 127 struct mptcp_skb_cb { 129 128 u64 map_seq; ··· 268 269 u64 rcv_data_fin_seq; 269 270 u64 bytes_retrans; 270 271 int rmem_fwd_alloc; 271 - struct sock *last_snd; 272 272 int snd_burst; 273 273 int old_wspace; 274 274 u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; ··· 312 314 * lock as such sock is freed after close(). 313 315 */ 314 316 struct mptcp_pm_data pm; 317 + struct mptcp_sched_ops *sched; 315 318 struct { 316 319 u32 space; /* bytes copied in last measurement window */ 317 320 u32 copied; /* bytes copied in this measurement window */ ··· 491 492 is_mptfo : 1, /* subflow is doing TFO */ 492 493 __unused : 9; 493 494 enum mptcp_data_avail data_avail; 495 + bool scheduled; 494 496 u32 remote_nonce; 495 497 u64 thmac; 496 498 u32 local_nonce; ··· 625 625 int mptcp_allow_join_id0(const struct net *net); 626 626 unsigned int mptcp_stale_loss_cnt(const struct net *net); 627 627 int mptcp_get_pm_type(const struct net *net); 628 + const char *mptcp_get_scheduler(const struct net *net); 628 629 void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, 629 630 const struct mptcp_options_received *mp_opt); 630 631 bool __mptcp_retransmit_pending_data(struct sock *sk); ··· 658 657 void mptcp_info2sockaddr(const struct mptcp_addr_info *info, 659 658 struct sockaddr_storage *addr, 660 659 unsigned short family); 660 + struct mptcp_sched_ops *mptcp_sched_find(const char *name); 661 + int mptcp_register_scheduler(struct mptcp_sched_ops *sched); 662 + void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); 663 + void mptcp_sched_init(void); 664 + int mptcp_init_sched(struct mptcp_sock *msk, 665 + struct mptcp_sched_ops *sched); 666 + void mptcp_release_sched(struct mptcp_sock *msk); 667 + void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, 668 + 
bool scheduled); 669 + struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk); 670 + struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk); 671 + int mptcp_sched_get_send(struct mptcp_sock *msk); 672 + int mptcp_sched_get_retrans(struct mptcp_sock *msk); 661 673 662 674 static inline bool __tcp_can_send(const struct sock *ssk) 663 675 {
+173
net/mptcp/sched.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Multipath TCP 3 + * 4 + * Copyright (c) 2022, SUSE. 5 + */ 6 + 7 + #define pr_fmt(fmt) "MPTCP: " fmt 8 + 9 + #include <linux/kernel.h> 10 + #include <linux/module.h> 11 + #include <linux/list.h> 12 + #include <linux/rculist.h> 13 + #include <linux/spinlock.h> 14 + #include "protocol.h" 15 + 16 + static DEFINE_SPINLOCK(mptcp_sched_list_lock); 17 + static LIST_HEAD(mptcp_sched_list); 18 + 19 + static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, 20 + struct mptcp_sched_data *data) 21 + { 22 + struct sock *ssk; 23 + 24 + ssk = data->reinject ? mptcp_subflow_get_retrans(msk) : 25 + mptcp_subflow_get_send(msk); 26 + if (!ssk) 27 + return -EINVAL; 28 + 29 + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); 30 + return 0; 31 + } 32 + 33 + static struct mptcp_sched_ops mptcp_sched_default = { 34 + .get_subflow = mptcp_sched_default_get_subflow, 35 + .name = "default", 36 + .owner = THIS_MODULE, 37 + }; 38 + 39 + /* Must be called with rcu read lock held */ 40 + struct mptcp_sched_ops *mptcp_sched_find(const char *name) 41 + { 42 + struct mptcp_sched_ops *sched, *ret = NULL; 43 + 44 + list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { 45 + if (!strcmp(sched->name, name)) { 46 + ret = sched; 47 + break; 48 + } 49 + } 50 + 51 + return ret; 52 + } 53 + 54 + int mptcp_register_scheduler(struct mptcp_sched_ops *sched) 55 + { 56 + if (!sched->get_subflow) 57 + return -EINVAL; 58 + 59 + spin_lock(&mptcp_sched_list_lock); 60 + if (mptcp_sched_find(sched->name)) { 61 + spin_unlock(&mptcp_sched_list_lock); 62 + return -EEXIST; 63 + } 64 + list_add_tail_rcu(&sched->list, &mptcp_sched_list); 65 + spin_unlock(&mptcp_sched_list_lock); 66 + 67 + pr_debug("%s registered", sched->name); 68 + return 0; 69 + } 70 + 71 + void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) 72 + { 73 + if (sched == &mptcp_sched_default) 74 + return; 75 + 76 + spin_lock(&mptcp_sched_list_lock); 77 + 
list_del_rcu(&sched->list); 78 + spin_unlock(&mptcp_sched_list_lock); 79 + } 80 + 81 + void mptcp_sched_init(void) 82 + { 83 + mptcp_register_scheduler(&mptcp_sched_default); 84 + } 85 + 86 + int mptcp_init_sched(struct mptcp_sock *msk, 87 + struct mptcp_sched_ops *sched) 88 + { 89 + if (!sched) 90 + sched = &mptcp_sched_default; 91 + 92 + if (!bpf_try_module_get(sched, sched->owner)) 93 + return -EBUSY; 94 + 95 + msk->sched = sched; 96 + if (msk->sched->init) 97 + msk->sched->init(msk); 98 + 99 + pr_debug("sched=%s", msk->sched->name); 100 + 101 + return 0; 102 + } 103 + 104 + void mptcp_release_sched(struct mptcp_sock *msk) 105 + { 106 + struct mptcp_sched_ops *sched = msk->sched; 107 + 108 + if (!sched) 109 + return; 110 + 111 + msk->sched = NULL; 112 + if (sched->release) 113 + sched->release(msk); 114 + 115 + bpf_module_put(sched, sched->owner); 116 + } 117 + 118 + void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, 119 + bool scheduled) 120 + { 121 + WRITE_ONCE(subflow->scheduled, scheduled); 122 + } 123 + 124 + int mptcp_sched_get_send(struct mptcp_sock *msk) 125 + { 126 + struct mptcp_subflow_context *subflow; 127 + struct mptcp_sched_data data; 128 + 129 + msk_owned_by_me(msk); 130 + 131 + /* the following check is moved out of mptcp_subflow_get_send */ 132 + if (__mptcp_check_fallback(msk)) { 133 + if (msk->first && 134 + __tcp_can_send(msk->first) && 135 + sk_stream_memory_free(msk->first)) { 136 + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true); 137 + return 0; 138 + } 139 + return -EINVAL; 140 + } 141 + 142 + mptcp_for_each_subflow(msk, subflow) { 143 + if (READ_ONCE(subflow->scheduled)) 144 + return 0; 145 + } 146 + 147 + data.reinject = false; 148 + if (msk->sched == &mptcp_sched_default || !msk->sched) 149 + return mptcp_sched_default_get_subflow(msk, &data); 150 + return msk->sched->get_subflow(msk, &data); 151 + } 152 + 153 + int mptcp_sched_get_retrans(struct mptcp_sock *msk) 154 + { 155 + struct 
mptcp_subflow_context *subflow; 156 + struct mptcp_sched_data data; 157 + 158 + msk_owned_by_me(msk); 159 + 160 + /* the following check is moved out of mptcp_subflow_get_retrans */ 161 + if (__mptcp_check_fallback(msk)) 162 + return -EINVAL; 163 + 164 + mptcp_for_each_subflow(msk, subflow) { 165 + if (READ_ONCE(subflow->scheduled)) 166 + return 0; 167 + } 168 + 169 + data.reinject = true; 170 + if (msk->sched == &mptcp_sched_default || !msk->sched) 171 + return mptcp_sched_default_get_subflow(msk, &data); 172 + return msk->sched->get_subflow(msk, &data); 173 + }