Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

bpf, sockmap: Fix FIONREAD for sockmap

A socket using sockmap has its own independent receive queue: ingress_msg.
This queue may contain data from its own protocol stack or from other
sockets.

Therefore, for sockmap, relying solely on copied_seq and rcv_nxt to
calculate FIONREAD is not enough.

This patch adds a new msg_tot_len field in the psock structure to record
the data length in ingress_msg. Additionally, we implement new ioctl
interfaces for TCP and UDP to intercept FIONREAD operations.

Note that we intentionally do not include sk_receive_queue data in the
FIONREAD result. Data in sk_receive_queue has not yet been processed by
the BPF verdict program, and may be redirected to other sockets or
dropped. Including it would create semantic ambiguity since this data
may never be readable by the user.

Unix and VSOCK sockets have similar issues, but fixing them is outside
the scope of this patch as it would require more intrusive changes.

Previous work by John Fastabend made some efforts towards FIONREAD support:
commit e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq").
Although the current patch builds on that work, it is acceptable for our
Fixes tag to point to the same commit.

                                                   FD1:read()
                                                   --  FD1->copied_seq++
                                                       |  [read data]
                                                       |
                                [enqueue data]         v
                  [sockmap]     -> ingress to self ->  ingress_msg queue
FD1 native stack  ------>                                 ^
-- FD1->rcv_nxt++               -> redirect to other      | [enqueue data]
                                       |                  |
                                       |             ingress to FD1
                                       v                  ^
                                      ...                 |  [sockmap]
                                                     FD2 native stack

Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/r/20260124113314.113584-3-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

authored by

Jiayuan Chen and committed by
Alexei Starovoitov
929e30f9 b40cc5ad

+108 -6
+66 -2
include/linux/skmsg.h
··· 97 97 struct sk_buff_head ingress_skb; 98 98 struct list_head ingress_msg; 99 99 spinlock_t ingress_lock; 100 + /** @msg_tot_len: Total bytes queued in ingress_msg list. */ 101 + u32 msg_tot_len; 100 102 unsigned long state; 101 103 struct list_head link; 102 104 spinlock_t link_lock; ··· 323 321 kfree_skb(skb); 324 322 } 325 323 324 + static inline u32 sk_psock_get_msg_len_nolock(struct sk_psock *psock) 325 + { 326 + /* Used by ioctl to read msg_tot_len only; lock-free for performance */ 327 + return READ_ONCE(psock->msg_tot_len); 328 + } 329 + 330 + static inline void sk_psock_msg_len_add_locked(struct sk_psock *psock, int diff) 331 + { 332 + /* Use WRITE_ONCE to ensure correct read in sk_psock_get_msg_len_nolock(). 333 + * ingress_lock should be held to prevent concurrent updates to msg_tot_len 334 + */ 335 + WRITE_ONCE(psock->msg_tot_len, psock->msg_tot_len + diff); 336 + } 337 + 338 + static inline void sk_psock_msg_len_add(struct sk_psock *psock, int diff) 339 + { 340 + spin_lock_bh(&psock->ingress_lock); 341 + sk_psock_msg_len_add_locked(psock, diff); 342 + spin_unlock_bh(&psock->ingress_lock); 343 + } 344 + 326 345 static inline bool sk_psock_queue_msg(struct sk_psock *psock, 327 346 struct sk_msg *msg) 328 347 { ··· 352 329 spin_lock_bh(&psock->ingress_lock); 353 330 if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { 354 331 list_add_tail(&msg->list, &psock->ingress_msg); 332 + sk_psock_msg_len_add_locked(psock, msg->sg.size); 355 333 ret = true; 356 334 } else { 357 335 sk_msg_free(psock->sk, msg); ··· 369 345 370 346 spin_lock_bh(&psock->ingress_lock); 371 347 msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); 372 - if (msg) 348 + if (msg) { 373 349 list_del(&msg->list); 350 + sk_psock_msg_len_add_locked(psock, -msg->sg.size); 351 + } 374 352 spin_unlock_bh(&psock->ingress_lock); 375 353 return msg; 354 + } 355 + 356 + static inline struct sk_msg *sk_psock_peek_msg_locked(struct sk_psock *psock) 357 + { 358 + return 
list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); 376 359 } 377 360 378 361 static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) ··· 387 356 struct sk_msg *msg; 388 357 389 358 spin_lock_bh(&psock->ingress_lock); 390 - msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); 359 + msg = sk_psock_peek_msg_locked(psock); 391 360 spin_unlock_bh(&psock->ingress_lock); 392 361 return msg; 393 362 } ··· 552 521 if (!psock) 553 522 return false; 554 523 return !!psock->saved_data_ready; 524 + } 525 + 526 + /* for tcp only, sk is locked */ 527 + static inline ssize_t sk_psock_msg_inq(struct sock *sk) 528 + { 529 + struct sk_psock *psock; 530 + ssize_t inq = 0; 531 + 532 + psock = sk_psock_get(sk); 533 + if (likely(psock)) { 534 + inq = sk_psock_get_msg_len_nolock(psock); 535 + sk_psock_put(sk, psock); 536 + } 537 + return inq; 538 + } 539 + 540 + /* for udp only, sk is not locked */ 541 + static inline ssize_t sk_msg_first_len(struct sock *sk) 542 + { 543 + struct sk_psock *psock; 544 + struct sk_msg *msg; 545 + ssize_t inq = 0; 546 + 547 + psock = sk_psock_get(sk); 548 + if (likely(psock)) { 549 + spin_lock_bh(&psock->ingress_lock); 550 + msg = sk_psock_peek_msg_locked(psock); 551 + if (msg) 552 + inq = msg->sg.size; 553 + spin_unlock_bh(&psock->ingress_lock); 554 + sk_psock_put(sk, psock); 555 + } 556 + return inq; 555 557 } 556 558 557 559 #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+3
net/core/skmsg.c
··· 458 458 atomic_sub(copy, &sk->sk_rmem_alloc); 459 459 } 460 460 msg_rx->sg.size -= copy; 461 + sk_psock_msg_len_add(psock, -copy); 461 462 462 463 if (!sge->length) { 463 464 sk_msg_iter_var_next(i); ··· 822 821 list_del(&msg->list); 823 822 if (!msg->skb) 824 823 atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); 824 + sk_psock_msg_len_add(psock, -msg->sg.size); 825 825 sk_msg_free(psock->sk, msg); 826 826 kfree(msg); 827 827 } 828 + WARN_ON_ONCE(psock->msg_tot_len); 828 829 } 829 830 830 831 static void __sk_psock_zap_ingress(struct sk_psock *psock)
+20
net/ipv4/tcp_bpf.c
··· 10 10 11 11 #include <net/inet_common.h> 12 12 #include <net/tls.h> 13 + #include <asm/ioctls.h> 13 14 14 15 void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) 15 16 { ··· 333 332 return copied; 334 333 } 335 334 335 + static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg) 336 + { 337 + bool slow; 338 + 339 + if (cmd != SIOCINQ) 340 + return tcp_ioctl(sk, cmd, karg); 341 + 342 + /* works similar as tcp_ioctl */ 343 + if (sk->sk_state == TCP_LISTEN) 344 + return -EINVAL; 345 + 346 + slow = lock_sock_fast(sk); 347 + *karg = sk_psock_msg_inq(sk); 348 + unlock_sock_fast(sk, slow); 349 + 350 + return 0; 351 + } 352 + 336 353 static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, 337 354 int flags, int *addr_len) 338 355 { ··· 629 610 prot[TCP_BPF_BASE].close = sock_map_close; 630 611 prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; 631 612 prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; 613 + prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl; 632 614 633 615 prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; 634 616 prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
+19 -4
net/ipv4/udp_bpf.c
··· 5 5 #include <net/sock.h> 6 6 #include <net/udp.h> 7 7 #include <net/inet_common.h> 8 + #include <asm/ioctls.h> 8 9 9 10 #include "udp_impl.h" 10 11 ··· 112 111 static DEFINE_SPINLOCK(udpv6_prot_lock); 113 112 static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; 114 113 114 + static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg) 115 + { 116 + if (cmd != SIOCINQ) 117 + return udp_ioctl(sk, cmd, karg); 118 + 119 + /* Since we don't hold a lock, sk_receive_queue may contain data. 120 + * BPF might only be processing this data at the moment. We only 121 + * care about the data in the ingress_msg here. 122 + */ 123 + *karg = sk_msg_first_len(sk); 124 + return 0; 125 + } 126 + 115 127 static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) 116 128 { 117 - *prot = *base; 118 - prot->close = sock_map_close; 119 - prot->recvmsg = udp_bpf_recvmsg; 120 - prot->sock_is_readable = sk_msg_is_readable; 129 + *prot = *base; 130 + prot->close = sock_map_close; 131 + prot->recvmsg = udp_bpf_recvmsg; 132 + prot->sock_is_readable = sk_msg_is_readable; 133 + prot->ioctl = udp_bpf_ioctl; 121 134 } 122 135 123 136 static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)