Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'tcp-fix-tcp_poll-races'

Eric Dumazet says:

====================
tcp: fix tcp_poll() races

Flakes in packetdrill tests stressing epoll_wait()
were root caused to bad ordering in tcp_write_err()

Precisely, we have to call sk_error_report() after
tcp_done().

When fixing this issue, we discovered tcp_abort(),
tcp_v4_err() and tcp_v6_err() had similar issues.

Since tcp_reset() has the correct ordering,
first patch takes part of it and creates
tcp_done_with_error() helper.
====================

Link: https://lore.kernel.org/r/20240528125253.1966136-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+31 -37
+1
include/net/tcp.h
··· 677 677 /* tcp_input.c */ 678 678 void tcp_rearm_rto(struct sock *sk); 679 679 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); 680 + void tcp_done_with_error(struct sock *sk, int err); 680 681 void tcp_reset(struct sock *sk, struct sk_buff *skb); 681 682 void tcp_fin(struct sock *sk); 682 683 void tcp_check_space(struct sock *sk);
+2 -6
net/ipv4/tcp.c
··· 598 598 */ 599 599 mask |= EPOLLOUT | EPOLLWRNORM; 600 600 } 601 - /* This barrier is coupled with smp_wmb() in tcp_reset() */ 601 + /* This barrier is coupled with smp_wmb() in tcp_done_with_error() */ 602 602 smp_rmb(); 603 603 if (READ_ONCE(sk->sk_err) || 604 604 !skb_queue_empty_lockless(&sk->sk_error_queue)) ··· 4576 4576 bh_lock_sock(sk); 4577 4577 4578 4578 if (!sock_flag(sk, SOCK_DEAD)) { 4579 - WRITE_ONCE(sk->sk_err, err); 4580 - /* This barrier is coupled with smp_rmb() in tcp_poll() */ 4581 - smp_wmb(); 4582 - sk_error_report(sk); 4583 4579 if (tcp_need_reset(sk->sk_state)) 4584 4580 tcp_send_active_reset(sk, GFP_ATOMIC, 4585 4581 SK_RST_REASON_NOT_SPECIFIED); 4586 - tcp_done(sk); 4582 + tcp_done_with_error(sk, err); 4587 4583 } 4588 4584 4589 4585 bh_unlock_sock(sk);
+21 -11
net/ipv4/tcp_input.c
··· 4436 4436 return SKB_NOT_DROPPED_YET; 4437 4437 } 4438 4438 4439 + 4440 + void tcp_done_with_error(struct sock *sk, int err) 4441 + { 4442 + /* This barrier is coupled with smp_rmb() in tcp_poll() */ 4443 + WRITE_ONCE(sk->sk_err, err); 4444 + smp_wmb(); 4445 + 4446 + tcp_write_queue_purge(sk); 4447 + tcp_done(sk); 4448 + 4449 + if (!sock_flag(sk, SOCK_DEAD)) 4450 + sk_error_report(sk); 4451 + } 4452 + EXPORT_SYMBOL(tcp_done_with_error); 4453 + 4439 4454 /* When we get a reset we do this. */ 4440 4455 void tcp_reset(struct sock *sk, struct sk_buff *skb) 4441 4456 { 4457 + int err; 4458 + 4442 4459 trace_tcp_receive_reset(sk); 4443 4460 4444 4461 /* mptcp can't tell us to ignore reset pkts, ··· 4467 4450 /* We want the right error as BSD sees it (and indeed as we do). */ 4468 4451 switch (sk->sk_state) { 4469 4452 case TCP_SYN_SENT: 4470 - WRITE_ONCE(sk->sk_err, ECONNREFUSED); 4453 + err = ECONNREFUSED; 4471 4454 break; 4472 4455 case TCP_CLOSE_WAIT: 4473 - WRITE_ONCE(sk->sk_err, EPIPE); 4456 + err = EPIPE; 4474 4457 break; 4475 4458 case TCP_CLOSE: 4476 4459 return; 4477 4460 default: 4478 - WRITE_ONCE(sk->sk_err, ECONNRESET); 4461 + err = ECONNRESET; 4479 4462 } 4480 - /* This barrier is coupled with smp_rmb() in tcp_poll() */ 4481 - smp_wmb(); 4482 - 4483 - tcp_write_queue_purge(sk); 4484 - tcp_done(sk); 4485 - 4486 - if (!sock_flag(sk, SOCK_DEAD)) 4487 - sk_error_report(sk); 4463 + tcp_done_with_error(sk, err); 4488 4464 } 4489 4465 4490 4466 /*
+3 -8
net/ipv4/tcp_ipv4.c
··· 611 611 612 612 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); 613 613 614 - if (!sock_owned_by_user(sk)) { 615 - WRITE_ONCE(sk->sk_err, err); 616 - 617 - sk_error_report(sk); 618 - 619 - tcp_done(sk); 620 - } else { 614 + if (!sock_owned_by_user(sk)) 615 + tcp_done_with_error(sk, err); 616 + else 621 617 WRITE_ONCE(sk->sk_err_soft, err); 622 - } 623 618 goto out; 624 619 } 625 620
+1 -5
net/ipv4/tcp_timer.c
··· 74 74 75 75 static void tcp_write_err(struct sock *sk) 76 76 { 77 - WRITE_ONCE(sk->sk_err, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT); 78 - sk_error_report(sk); 79 - 80 - tcp_write_queue_purge(sk); 81 - tcp_done(sk); 77 + tcp_done_with_error(sk, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT); 82 78 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); 83 79 } 84 80
+3 -7
net/ipv6/tcp_ipv6.c
··· 490 490 491 491 ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); 492 492 493 - if (!sock_owned_by_user(sk)) { 494 - WRITE_ONCE(sk->sk_err, err); 495 - sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ 496 - 497 - tcp_done(sk); 498 - } else { 493 + if (!sock_owned_by_user(sk)) 494 + tcp_done_with_error(sk, err); 495 + else 499 496 WRITE_ONCE(sk->sk_err_soft, err); 500 - } 501 497 goto out; 502 498 case TCP_LISTEN: 503 499 break;