Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'tcp-oom-probe'

Menglong Dong says:

====================
net: tcp: support probing OOM

In this series, we make some small changes so that TCP retransmissions
become zero-window probes when the receiver drops the skb because of
memory pressure.

In the 1st patch, we reply with a zero-window ACK if the skb is dropped
because the receiver is out of memory, instead of dropping the skb silently.

In the 2nd patch, we allow a zero-window ACK to update the window.

In the 3rd patch, we fix the unexpected socket death when snd_wnd is 0 in
tcp_retransmit_timer().

In the 4th patch, we refactor the debug message in
tcp_retransmit_timer() to make it more accurate.

After these changes, TCP can keep probing an out-of-memory receiver indefinitely.

Changes since v3:
- make the timeout "2 * TCP_RTO_MAX" in the 3rd patch
- tp->retrans_stamp is not based on jiffies and can't be compared with
icsk->icsk_timeout; fix this in the 3rd patch.
- introduce the 4th patch

Changes since v2:
- refactor the code to avoid code duplication in the 1st patch
- use after() instead of max() in tcp_rtx_probe0_timed_out()

Changes since v1:
- send 0 rwin ACK for the receive queue empty case when necessary in the
1st patch
- send the ACK immediately by using the ICSK_ACK_NOW flag in the 1st
patch
- consider the case of the connection restarting from idle, as Neal commented,
in the 3rd patch
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+56 -22
+2 -1
include/net/inet_connection_sock.h
··· 164 164 ICSK_ACK_TIMER = 2, 165 165 ICSK_ACK_PUSHED = 4, 166 166 ICSK_ACK_PUSHED2 = 8, 167 - ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ 167 + ICSK_ACK_NOW = 16, /* Send the next ACK immediately (once) */ 168 + ICSK_ACK_NOMEM = 32, 168 169 }; 169 170 170 171 void inet_csk_init_xmit_timers(struct sock *sk,
+13 -7
net/ipv4/tcp_input.c
··· 3525 3525 { 3526 3526 return after(ack, tp->snd_una) || 3527 3527 after(ack_seq, tp->snd_wl1) || 3528 - (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); 3528 + (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin)); 3529 3529 } 3530 3530 3531 3531 /* If we update tp->snd_una, also update tp->bytes_acked */ ··· 5059 5059 5060 5060 /* Ok. In sequence. In window. */ 5061 5061 queue_and_out: 5062 - if (skb_queue_len(&sk->sk_receive_queue) == 0) 5063 - sk_forced_mem_schedule(sk, skb->truesize); 5064 - else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { 5065 - reason = SKB_DROP_REASON_PROTO_MEM; 5066 - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); 5062 + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { 5063 + /* TODO: maybe ratelimit these WIN 0 ACK ? */ 5064 + inet_csk(sk)->icsk_ack.pending |= 5065 + (ICSK_ACK_NOMEM | ICSK_ACK_NOW); 5066 + inet_csk_schedule_ack(sk); 5067 5067 sk->sk_data_ready(sk); 5068 - goto drop; 5068 + 5069 + if (skb_queue_len(&sk->sk_receive_queue)) { 5070 + reason = SKB_DROP_REASON_PROTO_MEM; 5071 + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); 5072 + goto drop; 5073 + } 5074 + sk_forced_mem_schedule(sk, skb->truesize); 5069 5075 } 5070 5076 5071 5077 eaten = tcp_queue_rcv(sk, skb, &fragstolen);
+11 -3
net/ipv4/tcp_output.c
··· 257 257 static u16 tcp_select_window(struct sock *sk) 258 258 { 259 259 struct tcp_sock *tp = tcp_sk(sk); 260 - u32 old_win = tp->rcv_wnd; 261 - u32 cur_win = tcp_receive_window(tp); 262 - u32 new_win = __tcp_select_window(sk); 263 260 struct net *net = sock_net(sk); 261 + u32 old_win = tp->rcv_wnd; 262 + u32 cur_win, new_win; 264 263 264 + /* Make the window 0 if we failed to queue the data because we 265 + * are out of memory. The window is temporary, so we don't store 266 + * it on the socket. 267 + */ 268 + if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) 269 + return 0; 270 + 271 + cur_win = tcp_receive_window(tp); 272 + new_win = __tcp_select_window(sk); 265 273 if (new_win < cur_win) { 266 274 /* Danger Will Robinson! 267 275 * Don't update rcv_wup/rcv_wnd here or else
+30 -11
net/ipv4/tcp_timer.c
··· 454 454 req->timeout << req->num_timeout, TCP_RTO_MAX); 455 455 } 456 456 457 + static bool tcp_rtx_probe0_timed_out(const struct sock *sk, 458 + const struct sk_buff *skb) 459 + { 460 + const struct tcp_sock *tp = tcp_sk(sk); 461 + const int timeout = TCP_RTO_MAX * 2; 462 + u32 rcv_delta, rtx_delta; 463 + 464 + rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; 465 + if (rcv_delta <= timeout) 466 + return false; 467 + 468 + rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) - 469 + (tp->retrans_stamp ?: tcp_skb_timestamp(skb))); 470 + 471 + return rtx_delta > timeout; 472 + } 457 473 458 474 /** 459 475 * tcp_retransmit_timer() - The TCP retransmit timeout handler ··· 519 503 * we cannot allow such beasts to hang infinitely. 520 504 */ 521 505 struct inet_sock *inet = inet_sk(sk); 506 + u32 rtx_delta; 507 + 508 + rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb)); 522 509 if (sk->sk_family == AF_INET) { 523 - net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", 524 - &inet->inet_daddr, 525 - ntohs(inet->inet_dport), 526 - inet->inet_num, 527 - tp->snd_una, tp->snd_nxt); 510 + net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", 511 + &inet->inet_daddr, ntohs(inet->inet_dport), 512 + inet->inet_num, tp->snd_una, tp->snd_nxt, 513 + jiffies_to_msecs(jiffies - tp->rcv_tstamp), 514 + rtx_delta); 528 515 } 529 516 #if IS_ENABLED(CONFIG_IPV6) 530 517 else if (sk->sk_family == AF_INET6) { 531 - net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", 532 - &sk->sk_v6_daddr, 533 - ntohs(inet->inet_dport), 534 - inet->inet_num, 535 - tp->snd_una, tp->snd_nxt); 518 + net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", 519 + &sk->sk_v6_daddr, ntohs(inet->inet_dport), 520 + inet->inet_num, tp->snd_una, tp->snd_nxt, 521 + jiffies_to_msecs(jiffies - tp->rcv_tstamp), 522 + rtx_delta); 536 
523 } 537 524 #endif 538 - if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) { 525 + if (tcp_rtx_probe0_timed_out(sk, skb)) { 539 526 tcp_write_err(sk); 540 527 goto out; 541 528 }