Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

bpf, sockmap: Fix incorrect copied_seq calculation

A socket using sockmap has its own independent receive queue: ingress_msg.
This queue may contain data from its own protocol stack or from other
sockets.

The issue is that when reading from ingress_msg, we update tp->copied_seq
by default. However, if the data is not from its own protocol stack,
tp->rcv_nxt is not increased. Later, if we convert this socket to a
native socket, reading from this socket may fail because copied_seq might
be significantly larger than rcv_nxt.

This fix also addresses the syzkaller-reported bug referenced in the
Closes tag.

This patch marks the skmsg objects in ingress_msg. When reading, we update
copied_seq only if the data is from its own protocol stack.

                                                     FD1:read()
                                                     --  FD1->copied_seq++
                                                         |  [read data]
                                                         |
                                [enqueue data]           v
                  [sockmap]     -> ingress to self ->  ingress_msg queue
FD1 native stack  ------>                                 ^
-- FD1->rcv_nxt++               -> redirect to other      | [enqueue data]
                                       |                  |
                                       |             ingress to FD1
                                       v                  ^
                                      ...                 |  [sockmap]
                                                     FD2 native stack

Closes: https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983
Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Link: https://lore.kernel.org/r/20260124113314.113584-2-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Jiayuan Chen and committed by
Alexei Starovoitov
b40cc5ad 1456ebb2

+29 -5
+2
include/linux/skmsg.h
··· 141 141 struct sk_msg *msg, u32 bytes); 142 142 int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 143 143 int len, int flags); 144 + int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 145 + int len, int flags, int *copied_from_self); 144 146 bool sk_msg_is_readable(struct sock *sk); 145 147 146 148 static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
+24 -3
net/core/skmsg.c
··· 409 409 } 410 410 EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); 411 411 412 - /* Receive sk_msg from psock->ingress_msg to @msg. */ 413 - int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 414 - int len, int flags) 412 + int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 413 + int len, int flags, int *copied_from_self) 415 414 { 416 415 struct iov_iter *iter = &msg->msg_iter; 417 416 int peek = flags & MSG_PEEK; 418 417 struct sk_msg *msg_rx; 419 418 int i, copied = 0; 419 + bool from_self; 420 420 421 421 msg_rx = sk_psock_peek_msg(psock); 422 + if (copied_from_self) 423 + *copied_from_self = 0; 424 + 422 425 while (copied != len) { 423 426 struct scatterlist *sge; 424 427 425 428 if (unlikely(!msg_rx)) 426 429 break; 427 430 431 + from_self = msg_rx->sk == sk; 428 432 i = msg_rx->sg.start; 429 433 do { 430 434 struct page *page; ··· 447 443 } 448 444 449 445 copied += copy; 446 + if (from_self && copied_from_self) 447 + *copied_from_self += copy; 448 + 450 449 if (likely(!peek)) { 451 450 sge->offset += copy; 452 451 sge->length -= copy; ··· 493 486 } 494 487 out: 495 488 return copied; 489 + } 490 + 491 + /* Receive sk_msg from psock->ingress_msg to @msg. */ 492 + int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 493 + int len, int flags) 494 + { 495 + return __sk_msg_recvmsg(sk, psock, msg, len, flags, NULL); 496 496 } 497 497 EXPORT_SYMBOL_GPL(sk_msg_recvmsg); 498 498 ··· 630 616 if (unlikely(!msg)) 631 617 return -EAGAIN; 632 618 skb_set_owner_r(skb, sk); 619 + 620 + /* This is used in tcp_bpf_recvmsg_parser() to determine whether the 621 + * data originates from the socket's own protocol stack. No need to 622 + * refcount sk because msg's lifetime is bound to sk via the ingress_msg. 
623 + */ 624 + msg->sk = sk; 633 625 err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); 634 626 if (err < 0) 635 627 kfree(msg); ··· 929 909 sk_msg_compute_data_pointers(msg); 930 910 msg->sk = sk; 931 911 ret = bpf_prog_run_pin_on_cpu(prog, msg); 912 + msg->sk = NULL; 932 913 ret = sk_psock_map_verd(ret, msg->sk_redir); 933 914 psock->apply_bytes = msg->apply_bytes; 934 915 if (ret == __SK_REDIRECT) {
+3 -2
net/ipv4/tcp_bpf.c
··· 226 226 int peek = flags & MSG_PEEK; 227 227 struct sk_psock *psock; 228 228 struct tcp_sock *tcp; 229 + int copied_from_self = 0; 229 230 int copied = 0; 230 231 u32 seq; 231 232 ··· 263 262 } 264 263 265 264 msg_bytes_ready: 266 - copied = sk_msg_recvmsg(sk, psock, msg, len, flags); 265 + copied = __sk_msg_recvmsg(sk, psock, msg, len, flags, &copied_from_self); 267 266 /* The typical case for EFAULT is the socket was gracefully 268 267 * shutdown with a FIN pkt. So check here the other case is 269 268 * some error on copy_page_to_iter which would be unexpected. ··· 278 277 goto out; 279 278 } 280 279 } 281 - seq += copied; 280 + seq += copied_from_self; 282 281 if (!copied) { 283 282 long timeo; 284 283 int data;