Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'mptcp-fix-fallback-related-races'

Matthieu Baerts says:

====================
mptcp: fix fallback-related races

This series contains 3 fixes somewhat related to various races we have
while handling fallback.

The root cause of the issues addressed here is that the check for
"we can fall back to TCP now" and the related action are not atomic. That
also applies to fallback due to MP_FAIL -- where the race window is even
wider.

Address the issue by introducing an additional spinlock to bundle together
all the relevant events, as per patches 1 and 2. These fixes can be
backported up to v5.19 and v5.15.

Note that mptcp_disconnect() unconditionally clears the fallback status
(zeroing msk->flags) but doesn't touch the `allow_infinite_fallback`
flag. This issue is addressed in patch 3, and can be backported up to
v5.17.
====================

Link: https://patch.msgid.link/20250714-net-mptcp-fallback-races-v1-0-391aff963322@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+98 -28
+2 -1
net/mptcp/options.c
··· 978 978 if (subflow->mp_join) 979 979 goto reset; 980 980 subflow->mp_capable = 0; 981 + if (!mptcp_try_fallback(ssk)) 982 + goto reset; 981 983 pr_fallback(msk); 982 - mptcp_do_fallback(ssk); 983 984 return false; 984 985 } 985 986
+7 -1
net/mptcp/pm.c
··· 765 765 766 766 pr_debug("fail_seq=%llu\n", fail_seq); 767 767 768 - if (!READ_ONCE(msk->allow_infinite_fallback)) 768 + /* After accepting the fail, we can't create any other subflows */ 769 + spin_lock_bh(&msk->fallback_lock); 770 + if (!msk->allow_infinite_fallback) { 771 + spin_unlock_bh(&msk->fallback_lock); 769 772 return; 773 + } 774 + msk->allow_subflows = false; 775 + spin_unlock_bh(&msk->fallback_lock); 770 776 771 777 if (!subflow->fail_tout) { 772 778 pr_debug("send MP_FAIL response and infinite map\n");
+48 -8
net/mptcp/protocol.c
··· 560 560 561 561 static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) 562 562 { 563 - if (READ_ONCE(msk->allow_infinite_fallback)) { 563 + if (mptcp_try_fallback(ssk)) { 564 564 MPTCP_INC_STATS(sock_net(ssk), 565 565 MPTCP_MIB_DSSCORRUPTIONFALLBACK); 566 - mptcp_do_fallback(ssk); 567 566 } else { 568 567 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET); 569 568 mptcp_subflow_reset(ssk); ··· 791 792 static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) 792 793 { 793 794 mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); 794 - WRITE_ONCE(msk->allow_infinite_fallback, false); 795 + msk->allow_infinite_fallback = false; 795 796 mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); 796 797 } 797 798 ··· 802 803 if (sk->sk_state != TCP_ESTABLISHED) 803 804 return false; 804 805 806 + spin_lock_bh(&msk->fallback_lock); 807 + if (!msk->allow_subflows) { 808 + spin_unlock_bh(&msk->fallback_lock); 809 + return false; 810 + } 811 + mptcp_subflow_joined(msk, ssk); 812 + spin_unlock_bh(&msk->fallback_lock); 813 + 805 814 /* attach to msk socket only after we are sure we will deal with it 806 815 * at close time 807 816 */ ··· 818 811 819 812 mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; 820 813 mptcp_sockopt_sync_locked(msk, ssk); 821 - mptcp_subflow_joined(msk, ssk); 822 814 mptcp_stop_tout_timer(sk); 823 815 __mptcp_propagate_sndbuf(sk, ssk); 824 816 return true; ··· 1142 1136 mpext->infinite_map = 1; 1143 1137 mpext->data_len = 0; 1144 1138 1139 + if (!mptcp_try_fallback(ssk)) { 1140 + mptcp_subflow_reset(ssk); 1141 + return; 1142 + } 1143 + 1145 1144 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); 1146 1145 mptcp_subflow_ctx(ssk)->send_infinite_map = 0; 1147 1146 pr_fallback(msk); 1148 - mptcp_do_fallback(ssk); 1149 1147 } 1150 1148 1151 1149 #define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1)) ··· 2553 2543 2554 2544 static void __mptcp_retrans(struct sock *sk) 
2555 2545 { 2546 + struct mptcp_sendmsg_info info = { .data_lock_held = true, }; 2556 2547 struct mptcp_sock *msk = mptcp_sk(sk); 2557 2548 struct mptcp_subflow_context *subflow; 2558 - struct mptcp_sendmsg_info info = {}; 2559 2549 struct mptcp_data_frag *dfrag; 2560 2550 struct sock *ssk; 2561 2551 int ret, err; ··· 2600 2590 info.sent = 0; 2601 2591 info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : 2602 2592 dfrag->already_sent; 2593 + 2594 + /* 2595 + * make the whole retrans decision, xmit, disallow 2596 + * fallback atomic 2597 + */ 2598 + spin_lock_bh(&msk->fallback_lock); 2599 + if (__mptcp_check_fallback(msk)) { 2600 + spin_unlock_bh(&msk->fallback_lock); 2601 + release_sock(ssk); 2602 + return; 2603 + } 2604 + 2603 2605 while (info.sent < info.limit) { 2604 2606 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); 2605 2607 if (ret <= 0) ··· 2625 2603 len = max(copied, len); 2626 2604 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, 2627 2605 info.size_goal); 2628 - WRITE_ONCE(msk->allow_infinite_fallback, false); 2606 + msk->allow_infinite_fallback = false; 2629 2607 } 2608 + spin_unlock_bh(&msk->fallback_lock); 2630 2609 2631 2610 release_sock(ssk); 2632 2611 } ··· 2753 2730 WRITE_ONCE(msk->first, NULL); 2754 2731 inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; 2755 2732 WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); 2756 - WRITE_ONCE(msk->allow_infinite_fallback, true); 2733 + msk->allow_infinite_fallback = true; 2734 + msk->allow_subflows = true; 2757 2735 msk->recovery = false; 2758 2736 msk->subflow_id = 1; 2759 2737 msk->last_data_sent = tcp_jiffies32; ··· 2762 2738 msk->last_ack_recv = tcp_jiffies32; 2763 2739 2764 2740 mptcp_pm_data_init(msk); 2741 + spin_lock_init(&msk->fallback_lock); 2765 2742 2766 2743 /* re-use the csk retrans timer for MPTCP-level retrans */ 2767 2744 timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); ··· 3142 3117 * subflow 3143 3118 */ 3144 3119 
mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); 3120 + 3121 + /* The first subflow is already in TCP_CLOSE status, the following 3122 + * can't overlap with a fallback anymore 3123 + */ 3124 + spin_lock_bh(&msk->fallback_lock); 3125 + msk->allow_subflows = true; 3126 + msk->allow_infinite_fallback = true; 3145 3127 WRITE_ONCE(msk->flags, 0); 3128 + spin_unlock_bh(&msk->fallback_lock); 3129 + 3146 3130 msk->cb_flags = 0; 3147 3131 msk->recovery = false; 3148 3132 WRITE_ONCE(msk->can_ack, false); ··· 3558 3524 3559 3525 /* active subflow, already present inside the conn_list */ 3560 3526 if (!list_empty(&subflow->node)) { 3527 + spin_lock_bh(&msk->fallback_lock); 3528 + if (!msk->allow_subflows) { 3529 + spin_unlock_bh(&msk->fallback_lock); 3530 + return false; 3531 + } 3561 3532 mptcp_subflow_joined(msk, ssk); 3533 + spin_unlock_bh(&msk->fallback_lock); 3562 3534 mptcp_propagate_sndbuf(parent, ssk); 3563 3535 return true; 3564 3536 }
+22 -7
net/mptcp/protocol.h
··· 346 346 u64 rtt_us; /* last maximum rtt of subflows */ 347 347 } rcvq_space; 348 348 u8 scaling_ratio; 349 + bool allow_subflows; 349 350 350 351 u32 subflow_id; 351 352 u32 setsockopt_seq; 352 353 char ca_name[TCP_CA_NAME_MAX]; 354 + 355 + spinlock_t fallback_lock; /* protects fallback, 356 + * allow_infinite_fallback and 357 + * allow_join 358 + */ 353 359 }; 354 360 355 361 #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) ··· 1222 1216 return __mptcp_check_fallback(msk); 1223 1217 } 1224 1218 1225 - static inline void __mptcp_do_fallback(struct mptcp_sock *msk) 1219 + static inline bool __mptcp_try_fallback(struct mptcp_sock *msk) 1226 1220 { 1227 1221 if (__mptcp_check_fallback(msk)) { 1228 1222 pr_debug("TCP fallback already done (msk=%p)\n", msk); 1229 - return; 1223 + return true; 1230 1224 } 1231 - if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback))) 1232 - return; 1225 + spin_lock_bh(&msk->fallback_lock); 1226 + if (!msk->allow_infinite_fallback) { 1227 + spin_unlock_bh(&msk->fallback_lock); 1228 + return false; 1229 + } 1230 + 1231 + msk->allow_subflows = false; 1233 1232 set_bit(MPTCP_FALLBACK_DONE, &msk->flags); 1233 + spin_unlock_bh(&msk->fallback_lock); 1234 + return true; 1234 1235 } 1235 1236 1236 1237 static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk) ··· 1249 1236 TCPF_SYN_RECV | TCPF_LISTEN)); 1250 1237 } 1251 1238 1252 - static inline void mptcp_do_fallback(struct sock *ssk) 1239 + static inline bool mptcp_try_fallback(struct sock *ssk) 1253 1240 { 1254 1241 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1255 1242 struct sock *sk = subflow->conn; 1256 1243 struct mptcp_sock *msk; 1257 1244 1258 1245 msk = mptcp_sk(sk); 1259 - __mptcp_do_fallback(msk); 1246 + if (!__mptcp_try_fallback(msk)) 1247 + return false; 1260 1248 if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) { 1261 1249 gfp_t saved_allocation = ssk->sk_allocation; 1262 1250 ··· 1269 1255 
tcp_shutdown(ssk, SEND_SHUTDOWN); 1270 1256 ssk->sk_allocation = saved_allocation; 1271 1257 } 1258 + return true; 1272 1259 } 1273 1260 1274 1261 #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)\n", __func__, a) ··· 1279 1264 { 1280 1265 pr_fallback(msk); 1281 1266 subflow->request_mptcp = 0; 1282 - __mptcp_do_fallback(msk); 1267 + WARN_ON_ONCE(!__mptcp_try_fallback(msk)); 1283 1268 } 1284 1269 1285 1270 static inline bool mptcp_check_infinite_map(struct sk_buff *skb)
+19 -11
net/mptcp/subflow.c
··· 544 544 mptcp_get_options(skb, &mp_opt); 545 545 if (subflow->request_mptcp) { 546 546 if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) { 547 + if (!mptcp_try_fallback(sk)) 548 + goto do_reset; 549 + 547 550 MPTCP_INC_STATS(sock_net(sk), 548 551 MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); 549 - mptcp_do_fallback(sk); 550 552 pr_fallback(msk); 551 553 goto fallback; 552 554 } ··· 1302 1300 mptcp_schedule_work(sk); 1303 1301 } 1304 1302 1305 - static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) 1303 + static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) 1306 1304 { 1307 1305 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1308 1306 unsigned long fail_tout; 1309 1307 1308 + /* we are really failing, prevent any later subflow join */ 1309 + spin_lock_bh(&msk->fallback_lock); 1310 + if (!msk->allow_infinite_fallback) { 1311 + spin_unlock_bh(&msk->fallback_lock); 1312 + return false; 1313 + } 1314 + msk->allow_subflows = false; 1315 + spin_unlock_bh(&msk->fallback_lock); 1316 + 1310 1317 /* graceful failure can happen only on the MPC subflow */ 1311 1318 if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) 1312 - return; 1319 + return false; 1313 1320 1314 1321 /* since the close timeout take precedence on the fail one, 1315 1322 * no need to start the latter when the first is already set 1316 1323 */ 1317 1324 if (sock_flag((struct sock *)msk, SOCK_DEAD)) 1318 - return; 1325 + return true; 1319 1326 1320 1327 /* we don't need extreme accuracy here, use a zero fail_tout as special 1321 1328 * value meaning no fail timeout at all; ··· 1336 1325 tcp_send_ack(ssk); 1337 1326 1338 1327 mptcp_reset_tout_timer(msk, subflow->fail_tout); 1328 + return true; 1339 1329 } 1340 1330 1341 1331 static bool subflow_check_data_avail(struct sock *ssk) ··· 1397 1385 (subflow->mp_join || subflow->valid_csum_seen)) { 1398 1386 subflow->send_mp_fail = 1; 1399 1387 1400 - if (!READ_ONCE(msk->allow_infinite_fallback)) { 1388 + if 
(!mptcp_subflow_fail(msk, ssk)) { 1401 1389 subflow->reset_transient = 0; 1402 1390 subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; 1403 1391 goto reset; 1404 1392 } 1405 - mptcp_subflow_fail(msk, ssk); 1406 1393 WRITE_ONCE(subflow->data_avail, true); 1407 1394 return true; 1408 1395 } 1409 1396 1410 - if (!READ_ONCE(msk->allow_infinite_fallback)) { 1397 + if (!mptcp_try_fallback(ssk)) { 1411 1398 /* fatal protocol error, close the socket. 1412 1399 * subflow_error_report() will introduce the appropriate barriers 1413 1400 */ ··· 1424 1413 WRITE_ONCE(subflow->data_avail, false); 1425 1414 return false; 1426 1415 } 1427 - 1428 - mptcp_do_fallback(ssk); 1429 1416 } 1430 1417 1431 1418 skb = skb_peek(&ssk->sk_receive_queue); ··· 1688 1679 /* discard the subflow socket */ 1689 1680 mptcp_sock_graft(ssk, sk->sk_socket); 1690 1681 iput(SOCK_INODE(sf)); 1691 - WRITE_ONCE(msk->allow_infinite_fallback, false); 1692 1682 mptcp_stop_tout_timer(sk); 1693 1683 return 0; 1694 1684 ··· 1859 1851 1860 1852 msk = mptcp_sk(parent); 1861 1853 if (subflow_simultaneous_connect(sk)) { 1862 - mptcp_do_fallback(sk); 1854 + WARN_ON_ONCE(!mptcp_try_fallback(sk)); 1863 1855 pr_fallback(msk); 1864 1856 subflow->conn_finished = 1; 1865 1857 mptcp_propagate_state(parent, sk, subflow, NULL);