Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'tcp-provide-better-locality-for-retransmit-timer'

Eric Dumazet says:

====================
tcp: provide better locality for retransmit timer

The TCP stack uses three timers per flow, currently spread this way:

- sk->sk_timer : keepalive timer
- icsk->icsk_retransmit_timer : retransmit timer
- icsk->icsk_delack_timer : delayed ack timer

This series moves the retransmit timer to the sk->sk_timer location,
to increase data locality in TX paths.

Keepalive timers are not often used, so this change should be neutral for them.

After the series, we have the following fields:

- sk->tcp_retransmit_timer : retransmit timer, in sock_write_tx group
- icsk->icsk_delack_timer : delayed ack timer
- icsk->icsk_keepalive_timer : keepalive timer

Moving icsk_delack_timer to a better location would also be welcome.
====================

Link: https://patch.msgid.link/20251124175013.1473655-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+74 -69
+1 -1
Documentation/networking/net_cachelines/inet_connection_sock.rst
··· 12 12 struct request_sock_queue icsk_accept_queue 13 13 struct inet_bind_bucket icsk_bind_hash read_mostly tcp_set_state 14 14 struct inet_bind2_bucket icsk_bind2_hash read_mostly tcp_set_state,inet_put_port 15 - struct timer_list icsk_retransmit_timer read_write inet_csk_reset_xmit_timer,tcp_connect 16 15 struct timer_list icsk_delack_timer read_mostly inet_csk_reset_xmit_timer,tcp_connect 16 + struct timer_list icsk_keepalive_timer 17 17 u32 icsk_rto read_write tcp_cwnd_validate,tcp_schedule_loss_probe,tcp_connect_init,tcp_connect,tcp_write_xmit,tcp_push_one 18 18 u32 icsk_rto_min 19 19 u32 icsk_rto_max read_mostly tcp_reset_xmit_timer
+12 -8
include/net/inet_connection_sock.h
··· 56 56 * @icsk_accept_queue: FIFO of established children 57 57 * @icsk_bind_hash: Bind node 58 58 * @icsk_bind2_hash: Bind node in the bhash2 table 59 - * @icsk_retransmit_timer: Resend (no ack) 59 + * @icsk_delack_timer: Delayed ACK timer 60 + * @icsk_keepalive_timer: Keepalive timer 61 + * @mptcp_tout_timer: mptcp timer 60 62 * @icsk_rto: Retransmit timeout 61 63 * @icsk_pmtu_cookie Last pmtu seen by socket 62 64 * @icsk_ca_ops Pluggable congestion control hook ··· 83 81 struct request_sock_queue icsk_accept_queue; 84 82 struct inet_bind_bucket *icsk_bind_hash; 85 83 struct inet_bind2_bucket *icsk_bind2_hash; 86 - struct timer_list icsk_retransmit_timer; 87 - struct timer_list icsk_delack_timer; 84 + struct timer_list icsk_delack_timer; 85 + union { 86 + struct timer_list icsk_keepalive_timer; 87 + struct timer_list mptcp_tout_timer; 88 + }; 88 89 __u32 icsk_rto; 89 90 __u32 icsk_rto_min; 90 91 u32 icsk_rto_max; ··· 189 184 memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); 190 185 } 191 186 192 - static inline unsigned long 193 - icsk_timeout(const struct inet_connection_sock *icsk) 187 + static inline unsigned long tcp_timeout_expires(const struct sock *sk) 194 188 { 195 - return READ_ONCE(icsk->icsk_retransmit_timer.expires); 189 + return READ_ONCE(sk->tcp_retransmit_timer.expires); 196 190 } 197 191 198 192 static inline unsigned long ··· 207 203 if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { 208 204 smp_store_release(&icsk->icsk_pending, 0); 209 205 #ifdef INET_CSK_CLEAR_TIMERS 210 - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); 206 + sk_stop_timer(sk, &sk->tcp_retransmit_timer); 211 207 #endif 212 208 } else if (what == ICSK_TIME_DACK) { 213 209 smp_store_release(&icsk->icsk_ack.pending, 0); ··· 239 235 if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || 240 236 what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { 241 237 smp_store_release(&icsk->icsk_pending, what); 242 - sk_reset_timer(sk, 
&icsk->icsk_retransmit_timer, when); 238 + sk_reset_timer(sk, &sk->tcp_retransmit_timer, when); 243 239 } else if (what == ICSK_TIME_DACK) { 244 240 smp_store_release(&icsk->icsk_ack.pending, 245 241 icsk->icsk_ack.pending | ICSK_ACK_TIMER);
+9 -4
include/net/sock.h
··· 305 305 * @sk_txrehash: enable TX hash rethink 306 306 * @sk_filter: socket filtering instructions 307 307 * @sk_timer: sock cleanup timer 308 + * @tcp_retransmit_timer: tcp retransmit timer 309 + * @mptcp_retransmit_timer: mptcp retransmit timer 308 310 * @sk_stamp: time stamp of last packet received 309 311 * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only 310 312 * @sk_tsflags: SO_TIMESTAMPING flags ··· 483 481 struct rb_root tcp_rtx_queue; 484 482 }; 485 483 struct sk_buff_head sk_write_queue; 486 - u32 sk_dst_pending_confirm; 487 - u32 sk_pacing_status; /* see enum sk_pacing */ 488 484 struct page_frag sk_frag; 489 - struct timer_list sk_timer; 490 - 485 + union { 486 + struct timer_list sk_timer; 487 + struct timer_list tcp_retransmit_timer; 488 + struct timer_list mptcp_retransmit_timer; 489 + }; 491 490 unsigned long sk_pacing_rate; /* bytes per second */ 492 491 atomic_t sk_zckey; 493 492 atomic_t sk_tskey; ··· 496 493 __cacheline_group_end(sock_write_tx); 497 494 498 495 __cacheline_group_begin(sock_read_tx); 496 + u32 sk_dst_pending_confirm; 497 + u32 sk_pacing_status; /* see enum sk_pacing */ 499 498 unsigned long sk_max_pacing_rate; 500 499 long sk_sndtimeo; 501 500 u32 sk_priority;
+2 -2
net/core/sock.c
··· 4519 4519 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); 4520 4520 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); 4521 4521 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); 4522 - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm); 4523 - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status); 4524 4522 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); 4525 4523 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); 4526 4524 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); 4527 4525 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); 4528 4526 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); 4529 4527 4528 + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm); 4529 + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status); 4530 4530 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); 4531 4531 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); 4532 4532 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
+6 -6
net/ipv4/inet_connection_sock.c
··· 737 737 { 738 738 struct inet_connection_sock *icsk = inet_csk(sk); 739 739 740 - timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); 740 + timer_setup(&sk->tcp_retransmit_timer, retransmit_handler, 0); 741 741 timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); 742 - timer_setup(&sk->sk_timer, keepalive_handler, 0); 742 + timer_setup(&icsk->icsk_keepalive_timer, keepalive_handler, 0); 743 743 icsk->icsk_pending = icsk->icsk_ack.pending = 0; 744 744 } 745 745 ··· 750 750 smp_store_release(&icsk->icsk_pending, 0); 751 751 smp_store_release(&icsk->icsk_ack.pending, 0); 752 752 753 - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); 753 + sk_stop_timer(sk, &sk->tcp_retransmit_timer); 754 754 sk_stop_timer(sk, &icsk->icsk_delack_timer); 755 - sk_stop_timer(sk, &sk->sk_timer); 755 + sk_stop_timer(sk, &icsk->icsk_keepalive_timer); 756 756 } 757 757 758 758 void inet_csk_clear_xmit_timers_sync(struct sock *sk) ··· 765 765 smp_store_release(&icsk->icsk_pending, 0); 766 766 smp_store_release(&icsk->icsk_ack.pending, 0); 767 767 768 - sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); 768 + sk_stop_timer_sync(sk, &sk->tcp_retransmit_timer); 769 769 sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); 770 - sk_stop_timer_sync(sk, &sk->sk_timer); 770 + sk_stop_timer_sync(sk, &icsk->icsk_keepalive_timer); 771 771 } 772 772 773 773 struct dst_entry *inet_csk_route_req(const struct sock *sk,
+4 -4
net/ipv4/inet_diag.c
··· 287 287 r->idiag_timer = 1; 288 288 r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits); 289 289 r->idiag_expires = 290 - jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); 290 + jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies); 291 291 } else if (icsk_pending == ICSK_TIME_PROBE0) { 292 292 r->idiag_timer = 4; 293 293 r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); 294 294 r->idiag_expires = 295 - jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); 296 - } else if (timer_pending(&sk->sk_timer)) { 295 + jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies); 296 + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { 297 297 r->idiag_timer = 2; 298 298 r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); 299 299 r->idiag_expires = 300 - jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); 300 + jiffies_delta_to_msecs(icsk->icsk_keepalive_timer.expires - jiffies); 301 301 } 302 302 303 303 if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
+4 -4
net/ipv4/tcp_ipv4.c
··· 2869 2869 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2870 2870 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2871 2871 timer_active = 1; 2872 - timer_expires = icsk_timeout(icsk); 2872 + timer_expires = tcp_timeout_expires(sk); 2873 2873 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2874 2874 timer_active = 4; 2875 - timer_expires = icsk_timeout(icsk); 2876 - } else if (timer_pending(&sk->sk_timer)) { 2875 + timer_expires = tcp_timeout_expires(sk); 2876 + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { 2877 2877 timer_active = 2; 2878 - timer_expires = sk->sk_timer.expires; 2878 + timer_expires = icsk->icsk_keepalive_timer.expires; 2879 2879 } else { 2880 2880 timer_active = 0; 2881 2881 timer_expires = jiffies;
+11 -12
net/ipv4/tcp_timer.c
··· 510 510 * and tp->rcv_tstamp might very well have been written recently. 511 511 * rcv_delta can thus be negative. 512 512 */ 513 - rcv_delta = icsk_timeout(icsk) - tp->rcv_tstamp; 513 + rcv_delta = tcp_timeout_expires(sk) - tp->rcv_tstamp; 514 514 if (rcv_delta <= timeout) 515 515 return false; 516 516 ··· 697 697 !icsk->icsk_pending) 698 698 return; 699 699 700 - if (time_after(icsk_timeout(icsk), jiffies)) { 701 - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, 702 - icsk_timeout(icsk)); 700 + if (time_after(tcp_timeout_expires(sk), jiffies)) { 701 + sk_reset_timer(sk, &sk->tcp_retransmit_timer, 702 + tcp_timeout_expires(sk)); 703 703 return; 704 704 } 705 705 tcp_mstamp_refresh(tcp_sk(sk)); ··· 725 725 726 726 static void tcp_write_timer(struct timer_list *t) 727 727 { 728 - struct inet_connection_sock *icsk = 729 - timer_container_of(icsk, t, icsk_retransmit_timer); 730 - struct sock *sk = &icsk->icsk_inet.sk; 728 + struct sock *sk = timer_container_of(sk, t, tcp_retransmit_timer); 731 729 732 730 /* Avoid locking the socket when there is no pending event. 
*/ 733 - if (!smp_load_acquire(&icsk->icsk_pending)) 731 + if (!smp_load_acquire(&inet_csk(sk)->icsk_pending)) 734 732 goto out; 735 733 736 734 bh_lock_sock(sk); ··· 753 755 754 756 void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len) 755 757 { 756 - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); 758 + sk_reset_timer(sk, &inet_csk(sk)->icsk_keepalive_timer, jiffies + len); 757 759 } 758 760 759 761 static void tcp_delete_keepalive_timer(struct sock *sk) 760 762 { 761 - sk_stop_timer(sk, &sk->sk_timer); 763 + sk_stop_timer(sk, &inet_csk(sk)->icsk_keepalive_timer); 762 764 } 763 765 764 766 void tcp_set_keepalive(struct sock *sk, int val) ··· 775 777 776 778 static void tcp_keepalive_timer(struct timer_list *t) 777 779 { 778 - struct sock *sk = timer_container_of(sk, t, sk_timer); 779 - struct inet_connection_sock *icsk = inet_csk(sk); 780 + struct inet_connection_sock *icsk = 781 + timer_container_of(icsk, t, icsk_keepalive_timer); 782 + struct sock *sk = &icsk->icsk_inet.sk; 780 783 struct tcp_sock *tp = tcp_sk(sk); 781 784 u32 elapsed; 782 785
+4 -4
net/ipv6/tcp_ipv6.c
··· 2163 2163 icsk_pending == ICSK_TIME_REO_TIMEOUT || 2164 2164 icsk_pending == ICSK_TIME_LOSS_PROBE) { 2165 2165 timer_active = 1; 2166 - timer_expires = icsk_timeout(icsk); 2166 + timer_expires = tcp_timeout_expires(sp); 2167 2167 } else if (icsk_pending == ICSK_TIME_PROBE0) { 2168 2168 timer_active = 4; 2169 - timer_expires = icsk_timeout(icsk); 2170 - } else if (timer_pending(&sp->sk_timer)) { 2169 + timer_expires = tcp_timeout_expires(sp); 2170 + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { 2171 2171 timer_active = 2; 2172 - timer_expires = sp->sk_timer.expires; 2172 + timer_expires = icsk->icsk_keepalive_timer.expires; 2173 2173 } else { 2174 2174 timer_active = 0; 2175 2175 timer_expires = jiffies;
+12 -15
net/mptcp/protocol.c
··· 411 411 412 412 static void mptcp_stop_rtx_timer(struct sock *sk) 413 413 { 414 - struct inet_connection_sock *icsk = inet_csk(sk); 415 - 416 - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); 414 + sk_stop_timer(sk, &sk->mptcp_retransmit_timer); 417 415 mptcp_sk(sk)->timer_ival = 0; 418 416 } 419 417 ··· 517 519 const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); 518 520 519 521 return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? 520 - icsk_timeout(inet_csk(ssk)) - jiffies : 0; 522 + tcp_timeout_expires(ssk) - jiffies : 0; 521 523 } 522 524 523 525 static void mptcp_set_timeout(struct sock *sk) ··· 952 954 953 955 static bool mptcp_rtx_timer_pending(struct sock *sk) 954 956 { 955 - return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); 957 + return timer_pending(&sk->mptcp_retransmit_timer); 956 958 } 957 959 958 960 static void mptcp_reset_rtx_timer(struct sock *sk) 959 961 { 960 - struct inet_connection_sock *icsk = inet_csk(sk); 961 962 unsigned long tout; 962 963 963 964 /* prevent rescheduling on close */ ··· 964 967 return; 965 968 966 969 tout = mptcp_sk(sk)->timer_ival; 967 - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); 970 + sk_reset_timer(sk, &sk->mptcp_retransmit_timer, jiffies + tout); 968 971 } 969 972 970 973 bool mptcp_schedule_work(struct sock *sk) ··· 2351 2354 2352 2355 static void mptcp_retransmit_timer(struct timer_list *t) 2353 2356 { 2354 - struct inet_connection_sock *icsk = timer_container_of(icsk, t, 2355 - icsk_retransmit_timer); 2356 - struct sock *sk = &icsk->icsk_inet.sk; 2357 + struct sock *sk = timer_container_of(sk, t, mptcp_retransmit_timer); 2357 2358 struct mptcp_sock *msk = mptcp_sk(sk); 2358 2359 2359 2360 bh_lock_sock(sk); ··· 2369 2374 2370 2375 static void mptcp_tout_timer(struct timer_list *t) 2371 2376 { 2372 - struct sock *sk = timer_container_of(sk, t, sk_timer); 2377 + struct inet_connection_sock *icsk = 2378 + timer_container_of(icsk, t, mptcp_tout_timer); 2379 + struct 
sock *sk = &icsk->icsk_inet.sk; 2373 2380 2374 2381 mptcp_schedule_work(sk); 2375 2382 sock_put(sk); ··· 2825 2828 */ 2826 2829 timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout; 2827 2830 2828 - sk_reset_timer(sk, &sk->sk_timer, timeout); 2831 + sk_reset_timer(sk, &inet_csk(sk)->mptcp_tout_timer, timeout); 2829 2832 } 2830 2833 2831 2834 static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) ··· 2970 2973 spin_lock_init(&msk->fallback_lock); 2971 2974 2972 2975 /* re-use the csk retrans timer for MPTCP-level retrans */ 2973 - timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); 2974 - timer_setup(&sk->sk_timer, mptcp_tout_timer, 0); 2976 + timer_setup(&sk->mptcp_retransmit_timer, mptcp_retransmit_timer, 0); 2977 + timer_setup(&msk->sk.mptcp_tout_timer, mptcp_tout_timer, 0); 2975 2978 } 2976 2979 2977 2980 static void mptcp_ca_reset(struct sock *sk) ··· 3173 3176 might_sleep(); 3174 3177 3175 3178 mptcp_stop_rtx_timer(sk); 3176 - sk_stop_timer(sk, &sk->sk_timer); 3179 + sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer); 3177 3180 msk->pm.status = 0; 3178 3181 mptcp_release_sched(msk); 3179 3182
+1 -1
net/mptcp/protocol.h
··· 892 892 if (!inet_csk(sk)->icsk_mtup.probe_timestamp) 893 893 return; 894 894 895 - sk_stop_timer(sk, &sk->sk_timer); 895 + sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer); 896 896 inet_csk(sk)->icsk_mtup.probe_timestamp = 0; 897 897 } 898 898
+4 -4
tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
··· 99 99 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 100 100 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 101 101 timer_active = 1; 102 - timer_expires = icsk->icsk_retransmit_timer.expires; 102 + timer_expires = sp->tcp_retransmit_timer.expires; 103 103 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 104 104 timer_active = 4; 105 - timer_expires = icsk->icsk_retransmit_timer.expires; 106 - } else if (timer_pending(&sp->sk_timer)) { 105 + timer_expires = sp->tcp_retransmit_timer.expires; 106 + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { 107 107 timer_active = 2; 108 - timer_expires = sp->sk_timer.expires; 108 + timer_expires = icsk->icsk_keepalive_timer.expires; 109 109 } else { 110 110 timer_active = 0; 111 111 timer_expires = bpf_jiffies64();
+4 -4
tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
··· 99 99 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 100 100 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 101 101 timer_active = 1; 102 - timer_expires = icsk->icsk_retransmit_timer.expires; 102 + timer_expires = sp->tcp_retransmit_timer.expires; 103 103 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { 104 104 timer_active = 4; 105 - timer_expires = icsk->icsk_retransmit_timer.expires; 106 - } else if (timer_pending(&sp->sk_timer)) { 105 + timer_expires = sp->tcp_retransmit_timer.expires; 106 + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { 107 107 timer_active = 2; 108 - timer_expires = sp->sk_timer.expires; 108 + timer_expires = icsk->icsk_keepalive_timer.expires; 109 109 } else { 110 110 timer_active = 0; 111 111 timer_expires = bpf_jiffies64();