Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'bpf-fix-fionread-and-copied_seq-issues'

Jiayuan Chen says:

====================
bpf: Fix FIONREAD and copied_seq issues

syzkaller reported a bug [1] where a socket using sockmap, after being
unloaded, exposed incorrect copied_seq calculation. The selftest I
provided can be used to reproduce the issue reported by syzkaller.

TCP recvmsg seq # bug 2: copied E92C873, seq E68D125, rcvnxt E7CEB7C, fl 40
WARNING: CPU: 1 PID: 5997 at net/ipv4/tcp.c:2724 tcp_recvmsg_locked+0xb2f/0x2910 net/ipv4/tcp.c:2724
Call Trace:
<TASK>
receive_fallback_to_copy net/ipv4/tcp.c:1968 [inline]
tcp_zerocopy_receive+0x131a/0x2120 net/ipv4/tcp.c:2200
do_tcp_getsockopt+0xe28/0x26c0 net/ipv4/tcp.c:4713
tcp_getsockopt+0xdf/0x100 net/ipv4/tcp.c:4812
do_sock_getsockopt+0x34d/0x440 net/socket.c:2421
__sys_getsockopt+0x12f/0x260 net/socket.c:2450
__do_sys_getsockopt net/socket.c:2457 [inline]
__se_sys_getsockopt net/socket.c:2454 [inline]
__x64_sys_getsockopt+0xbd/0x160 net/socket.c:2454
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xcd/0xfa0 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f

A sockmap socket maintains its own receive queue (ingress_msg) which may
contain data from either its own protocol stack or forwarded from other
sockets.

                                                     FD1:read()
                                                     --  FD1->copied_seq++
                                                         |  [read data]
                                                         |
                                [enqueue data]           v
                  [sockmap]     -> ingress to self ->  ingress_msg queue
FD1 native stack  ------>                                 ^
-- FD1->rcv_nxt++               -> redirect to other      | [enqueue data]
                                       |                  |
                                       |             ingress to FD1
                                       v                  ^
                                      ...                 |  [sockmap]
                                                     FD2 native stack

The issue occurs when reading from ingress_msg: we update tp->copied_seq
by default, but if the data comes from other sockets (not the socket's
own protocol stack), tp->rcv_nxt remains unchanged. Later, when
converting back to a native socket, reads may fail as copied_seq could
be significantly larger than rcv_nxt.

Additionally, FIONREAD calculation based on copied_seq and rcv_nxt is
insufficient for sockmap sockets, requiring separate field tracking.

[1] https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983
---
v7 -> v9: Address Jakub Sitnicki's feedback:
  - Remove sk_receive_queue check in tcp_bpf_ioctl, only report
    ingress_msg data length for FIONREAD
  - Minor nit fixes
  - Add Reviewed-by tag from John Fastabend
  - Fix CI error
https://lore.kernel.org/bpf/20260113025121.197535-1-jiayuan.chen@linux.dev/

v5 -> v7: Some modifications suggested by Jakub Sitnicki, and added Reviewed-by tag.
https://lore.kernel.org/bpf/20260106051458.279151-1-jiayuan.chen@linux.dev/

v1 -> v5: Use skmsg.sk instead of extending BPF_F_XXX macro and fix
failures reported by CI
v1: https://lore.kernel.org/bpf/20251117110736.293040-1-jiayuan.chen@linux.dev/
====================

Link: https://patch.msgid.link/20260124113314.113584-1-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

+439 -17
+68 -2
include/linux/skmsg.h
··· 97 97 struct sk_buff_head ingress_skb; 98 98 struct list_head ingress_msg; 99 99 spinlock_t ingress_lock; 100 + /** @msg_tot_len: Total bytes queued in ingress_msg list. */ 101 + u32 msg_tot_len; 100 102 unsigned long state; 101 103 struct list_head link; 102 104 spinlock_t link_lock; ··· 143 141 struct sk_msg *msg, u32 bytes); 144 142 int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 145 143 int len, int flags); 144 + int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 145 + int len, int flags, int *copied_from_self); 146 146 bool sk_msg_is_readable(struct sock *sk); 147 147 148 148 static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) ··· 323 319 kfree_skb(skb); 324 320 } 325 321 322 + static inline u32 sk_psock_get_msg_len_nolock(struct sk_psock *psock) 323 + { 324 + /* Used by ioctl to read msg_tot_len only; lock-free for performance */ 325 + return READ_ONCE(psock->msg_tot_len); 326 + } 327 + 328 + static inline void sk_psock_msg_len_add_locked(struct sk_psock *psock, int diff) 329 + { 330 + /* Use WRITE_ONCE to ensure correct read in sk_psock_get_msg_len_nolock(). 
331 + * ingress_lock should be held to prevent concurrent updates to msg_tot_len 332 + */ 333 + WRITE_ONCE(psock->msg_tot_len, psock->msg_tot_len + diff); 334 + } 335 + 336 + static inline void sk_psock_msg_len_add(struct sk_psock *psock, int diff) 337 + { 338 + spin_lock_bh(&psock->ingress_lock); 339 + sk_psock_msg_len_add_locked(psock, diff); 340 + spin_unlock_bh(&psock->ingress_lock); 341 + } 342 + 326 343 static inline bool sk_psock_queue_msg(struct sk_psock *psock, 327 344 struct sk_msg *msg) 328 345 { ··· 352 327 spin_lock_bh(&psock->ingress_lock); 353 328 if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { 354 329 list_add_tail(&msg->list, &psock->ingress_msg); 330 + sk_psock_msg_len_add_locked(psock, msg->sg.size); 355 331 ret = true; 356 332 } else { 357 333 sk_msg_free(psock->sk, msg); ··· 369 343 370 344 spin_lock_bh(&psock->ingress_lock); 371 345 msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); 372 - if (msg) 346 + if (msg) { 373 347 list_del(&msg->list); 348 + sk_psock_msg_len_add_locked(psock, -msg->sg.size); 349 + } 374 350 spin_unlock_bh(&psock->ingress_lock); 375 351 return msg; 352 + } 353 + 354 + static inline struct sk_msg *sk_psock_peek_msg_locked(struct sk_psock *psock) 355 + { 356 + return list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); 376 357 } 377 358 378 359 static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) ··· 387 354 struct sk_msg *msg; 388 355 389 356 spin_lock_bh(&psock->ingress_lock); 390 - msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); 357 + msg = sk_psock_peek_msg_locked(psock); 391 358 spin_unlock_bh(&psock->ingress_lock); 392 359 return msg; 393 360 } ··· 552 519 if (!psock) 553 520 return false; 554 521 return !!psock->saved_data_ready; 522 + } 523 + 524 + /* for tcp only, sk is locked */ 525 + static inline ssize_t sk_psock_msg_inq(struct sock *sk) 526 + { 527 + struct sk_psock *psock; 528 + ssize_t inq = 0; 529 + 530 + psock = 
sk_psock_get(sk); 531 + if (likely(psock)) { 532 + inq = sk_psock_get_msg_len_nolock(psock); 533 + sk_psock_put(sk, psock); 534 + } 535 + return inq; 536 + } 537 + 538 + /* for udp only, sk is not locked */ 539 + static inline ssize_t sk_msg_first_len(struct sock *sk) 540 + { 541 + struct sk_psock *psock; 542 + struct sk_msg *msg; 543 + ssize_t inq = 0; 544 + 545 + psock = sk_psock_get(sk); 546 + if (likely(psock)) { 547 + spin_lock_bh(&psock->ingress_lock); 548 + msg = sk_psock_peek_msg_locked(psock); 549 + if (msg) 550 + inq = msg->sg.size; 551 + spin_unlock_bh(&psock->ingress_lock); 552 + sk_psock_put(sk, psock); 553 + } 554 + return inq; 555 555 } 556 556 557 557 #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+27 -3
net/core/skmsg.c
··· 409 409 } 410 410 EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); 411 411 412 - /* Receive sk_msg from psock->ingress_msg to @msg. */ 413 - int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 414 - int len, int flags) 412 + int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 413 + int len, int flags, int *copied_from_self) 415 414 { 416 415 struct iov_iter *iter = &msg->msg_iter; 417 416 int peek = flags & MSG_PEEK; 418 417 struct sk_msg *msg_rx; 419 418 int i, copied = 0; 419 + bool from_self; 420 420 421 421 msg_rx = sk_psock_peek_msg(psock); 422 + if (copied_from_self) 423 + *copied_from_self = 0; 424 + 422 425 while (copied != len) { 423 426 struct scatterlist *sge; 424 427 425 428 if (unlikely(!msg_rx)) 426 429 break; 427 430 431 + from_self = msg_rx->sk == sk; 428 432 i = msg_rx->sg.start; 429 433 do { 430 434 struct page *page; ··· 447 443 } 448 444 449 445 copied += copy; 446 + if (from_self && copied_from_self) 447 + *copied_from_self += copy; 448 + 450 449 if (likely(!peek)) { 451 450 sge->offset += copy; 452 451 sge->length -= copy; ··· 458 451 atomic_sub(copy, &sk->sk_rmem_alloc); 459 452 } 460 453 msg_rx->sg.size -= copy; 454 + sk_psock_msg_len_add(psock, -copy); 461 455 462 456 if (!sge->length) { 463 457 sk_msg_iter_var_next(i); ··· 494 486 } 495 487 out: 496 488 return copied; 489 + } 490 + 491 + /* Receive sk_msg from psock->ingress_msg to @msg. */ 492 + int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, 493 + int len, int flags) 494 + { 495 + return __sk_msg_recvmsg(sk, psock, msg, len, flags, NULL); 497 496 } 498 497 EXPORT_SYMBOL_GPL(sk_msg_recvmsg); 499 498 ··· 631 616 if (unlikely(!msg)) 632 617 return -EAGAIN; 633 618 skb_set_owner_r(skb, sk); 619 + 620 + /* This is used in tcp_bpf_recvmsg_parser() to determine whether the 621 + * data originates from the socket's own protocol stack. 
No need to 622 + * refcount sk because msg's lifetime is bound to sk via the ingress_msg. 623 + */ 624 + msg->sk = sk; 634 625 err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); 635 626 if (err < 0) 636 627 kfree(msg); ··· 822 801 list_del(&msg->list); 823 802 if (!msg->skb) 824 803 atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); 804 + sk_psock_msg_len_add(psock, -msg->sg.size); 825 805 sk_msg_free(psock->sk, msg); 826 806 kfree(msg); 827 807 } 808 + WARN_ON_ONCE(psock->msg_tot_len); 828 809 } 829 810 830 811 static void __sk_psock_zap_ingress(struct sk_psock *psock) ··· 932 909 sk_msg_compute_data_pointers(msg); 933 910 msg->sk = sk; 934 911 ret = bpf_prog_run_pin_on_cpu(prog, msg); 912 + msg->sk = NULL; 935 913 ret = sk_psock_map_verd(ret, msg->sk_redir); 936 914 psock->apply_bytes = msg->apply_bytes; 937 915 if (ret == __SK_REDIRECT) {
+23 -2
net/ipv4/tcp_bpf.c
··· 10 10 11 11 #include <net/inet_common.h> 12 12 #include <net/tls.h> 13 + #include <asm/ioctls.h> 13 14 14 15 void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) 15 16 { ··· 227 226 int peek = flags & MSG_PEEK; 228 227 struct sk_psock *psock; 229 228 struct tcp_sock *tcp; 229 + int copied_from_self = 0; 230 230 int copied = 0; 231 231 u32 seq; 232 232 ··· 264 262 } 265 263 266 264 msg_bytes_ready: 267 - copied = sk_msg_recvmsg(sk, psock, msg, len, flags); 265 + copied = __sk_msg_recvmsg(sk, psock, msg, len, flags, &copied_from_self); 268 266 /* The typical case for EFAULT is the socket was gracefully 269 267 * shutdown with a FIN pkt. So check here the other case is 270 268 * some error on copy_page_to_iter which would be unexpected. ··· 279 277 goto out; 280 278 } 281 279 } 282 - seq += copied; 280 + seq += copied_from_self; 283 281 if (!copied) { 284 282 long timeo; 285 283 int data; ··· 331 329 release_sock(sk); 332 330 sk_psock_put(sk, psock); 333 331 return copied; 332 + } 333 + 334 + static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg) 335 + { 336 + bool slow; 337 + 338 + if (cmd != SIOCINQ) 339 + return tcp_ioctl(sk, cmd, karg); 340 + 341 + /* works similar as tcp_ioctl */ 342 + if (sk->sk_state == TCP_LISTEN) 343 + return -EINVAL; 344 + 345 + slow = lock_sock_fast(sk); 346 + *karg = sk_psock_msg_inq(sk); 347 + unlock_sock_fast(sk, slow); 348 + 349 + return 0; 334 350 } 335 351 336 352 static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ··· 629 609 prot[TCP_BPF_BASE].close = sock_map_close; 630 610 prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; 631 611 prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; 612 + prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl; 632 613 633 614 prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; 634 615 prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
+19 -4
net/ipv4/udp_bpf.c
··· 5 5 #include <net/sock.h> 6 6 #include <net/udp.h> 7 7 #include <net/inet_common.h> 8 + #include <asm/ioctls.h> 8 9 9 10 #include "udp_impl.h" 10 11 ··· 112 111 static DEFINE_SPINLOCK(udpv6_prot_lock); 113 112 static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; 114 113 114 + static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg) 115 + { 116 + if (cmd != SIOCINQ) 117 + return udp_ioctl(sk, cmd, karg); 118 + 119 + /* Since we don't hold a lock, sk_receive_queue may contain data. 120 + * BPF might only be processing this data at the moment. We only 121 + * care about the data in the ingress_msg here. 122 + */ 123 + *karg = sk_msg_first_len(sk); 124 + return 0; 125 + } 126 + 115 127 static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) 116 128 { 117 - *prot = *base; 118 - prot->close = sock_map_close; 119 - prot->recvmsg = udp_bpf_recvmsg; 120 - prot->sock_is_readable = sk_msg_is_readable; 129 + *prot = *base; 130 + prot->close = sock_map_close; 131 + prot->recvmsg = udp_bpf_recvmsg; 132 + prot->sock_is_readable = sk_msg_is_readable; 133 + prot->ioctl = udp_bpf_ioctl; 121 134 } 122 135 123 136 static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
+288 -6
tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 // Copyright (c) 2020 Cloudflare 3 3 #include <error.h> 4 - #include <netinet/tcp.h> 4 + #include <linux/tcp.h> 5 + #include <linux/socket.h> 5 6 #include <sys/epoll.h> 6 7 7 8 #include "test_progs.h" ··· 22 21 23 22 #define TCP_REPAIR_ON 1 24 23 #define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ 24 + 25 + /** 26 + * SOL_TCP is defined in <netinet/tcp.h> (glibc), but the copybuf_address 27 + * field of tcp_zerocopy_receive is not yet included in older versions. 28 + * This workaround remains necessary until the glibc update propagates. 29 + */ 30 + #ifndef SOL_TCP 31 + #define SOL_TCP 6 32 + #endif 25 33 26 34 static int connected_socket_v4(void) 27 35 { ··· 546 536 } 547 537 548 538 549 - static void test_sockmap_skb_verdict_fionread(bool pass_prog) 539 + static void do_test_sockmap_skb_verdict_fionread(int sotype, bool pass_prog) 550 540 { 551 541 int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1; 552 542 int expected, zero = 0, sent, recvd, avail; 553 543 struct test_sockmap_pass_prog *pass = NULL; 554 544 struct test_sockmap_drop_prog *drop = NULL; 555 545 char buf[256] = "0123456789"; 546 + int split_len = sizeof(buf) / 2; 556 547 557 548 if (pass_prog) { 558 549 pass = test_sockmap_pass_prog__open_and_load(); ··· 561 550 return; 562 551 verdict = bpf_program__fd(pass->progs.prog_skb_verdict); 563 552 map = bpf_map__fd(pass->maps.sock_map_rx); 564 - expected = sizeof(buf); 553 + if (sotype == SOCK_DGRAM) 554 + expected = split_len; /* FIONREAD for UDP is different from TCP */ 555 + else 556 + expected = sizeof(buf); 565 557 } else { 566 558 drop = test_sockmap_drop_prog__open_and_load(); 567 559 if (!ASSERT_OK_PTR(drop, "open_and_load")) ··· 580 566 if (!ASSERT_OK(err, "bpf_prog_attach")) 581 567 goto out; 582 568 583 - err = create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); 569 + err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1); 584 570 if (!ASSERT_OK(err, 
"create_socket_pairs()")) 585 571 goto out; 586 572 ··· 588 574 if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) 589 575 goto out_close; 590 576 591 - sent = xsend(p1, &buf, sizeof(buf), 0); 592 - ASSERT_EQ(sent, sizeof(buf), "xsend(p0)"); 577 + sent = xsend(p1, &buf, split_len, 0); 578 + sent += xsend(p1, &buf, sizeof(buf) - split_len, 0); 579 + ASSERT_EQ(sent, sizeof(buf), "xsend(p1)"); 593 580 err = ioctl(c1, FIONREAD, &avail); 594 581 ASSERT_OK(err, "ioctl(FIONREAD) error"); 595 582 ASSERT_EQ(avail, expected, "ioctl(FIONREAD)"); ··· 610 595 test_sockmap_pass_prog__destroy(pass); 611 596 else 612 597 test_sockmap_drop_prog__destroy(drop); 598 + } 599 + 600 + static void test_sockmap_skb_verdict_fionread(bool pass_prog) 601 + { 602 + do_test_sockmap_skb_verdict_fionread(SOCK_STREAM, pass_prog); 603 + do_test_sockmap_skb_verdict_fionread(SOCK_DGRAM, pass_prog); 613 604 } 614 605 615 606 static void test_sockmap_skb_verdict_change_tail(void) ··· 1063 1042 xclose(map); 1064 1043 } 1065 1044 1045 + /* it is used to reproduce WARNING */ 1046 + static void test_sockmap_zc(void) 1047 + { 1048 + int map, err, sent, recvd, zero = 0, one = 1, on = 1; 1049 + char buf[10] = "0123456789", rcv[11], addr[100]; 1050 + struct test_sockmap_pass_prog *skel = NULL; 1051 + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; 1052 + struct tcp_zerocopy_receive zc; 1053 + socklen_t zc_len = sizeof(zc); 1054 + struct bpf_program *prog; 1055 + 1056 + skel = test_sockmap_pass_prog__open_and_load(); 1057 + if (!ASSERT_OK_PTR(skel, "open_and_load")) 1058 + return; 1059 + 1060 + if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1)) 1061 + goto end; 1062 + 1063 + prog = skel->progs.prog_skb_verdict_ingress; 1064 + map = bpf_map__fd(skel->maps.sock_map_rx); 1065 + 1066 + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); 1067 + if (!ASSERT_OK(err, "bpf_prog_attach")) 1068 + goto end; 1069 + 1070 + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); 1071 + if 
(!ASSERT_OK(err, "bpf_map_update_elem")) 1072 + goto end; 1073 + 1074 + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); 1075 + if (!ASSERT_OK(err, "bpf_map_update_elem")) 1076 + goto end; 1077 + 1078 + sent = xsend(c0, buf, sizeof(buf), 0); 1079 + if (!ASSERT_EQ(sent, sizeof(buf), "xsend")) 1080 + goto end; 1081 + 1082 + /* trigger tcp_bpf_recvmsg_parser and inc copied_seq of p1 */ 1083 + recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1); 1084 + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1)")) 1085 + goto end; 1086 + 1087 + /* uninstall sockmap of p1 */ 1088 + bpf_map_delete_elem(map, &one); 1089 + 1090 + /* trigger tcp stack and the rcv_nxt of p1 is less than copied_seq */ 1091 + sent = xsend(c1, buf, sizeof(buf) - 1, 0); 1092 + if (!ASSERT_EQ(sent, sizeof(buf) - 1, "xsend")) 1093 + goto end; 1094 + 1095 + err = setsockopt(p1, SOL_SOCKET, SO_ZEROCOPY, &on, sizeof(on)); 1096 + if (!ASSERT_OK(err, "setsockopt")) 1097 + goto end; 1098 + 1099 + memset(&zc, 0, sizeof(zc)); 1100 + zc.copybuf_address = (__u64)((unsigned long)addr); 1101 + zc.copybuf_len = sizeof(addr); 1102 + 1103 + err = getsockopt(p1, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len); 1104 + if (!ASSERT_OK(err, "getsockopt")) 1105 + goto end; 1106 + 1107 + end: 1108 + if (c0 >= 0) 1109 + close(c0); 1110 + if (p0 >= 0) 1111 + close(p0); 1112 + if (c1 >= 0) 1113 + close(c1); 1114 + if (p1 >= 0) 1115 + close(p1); 1116 + test_sockmap_pass_prog__destroy(skel); 1117 + } 1118 + 1119 + /* it is used to check whether copied_seq of sk is correct */ 1120 + static void test_sockmap_copied_seq(bool strp) 1121 + { 1122 + int i, map, err, sent, recvd, zero = 0, one = 1; 1123 + struct test_sockmap_pass_prog *skel = NULL; 1124 + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; 1125 + char buf[10] = "0123456789", rcv[11]; 1126 + struct bpf_program *prog; 1127 + 1128 + skel = test_sockmap_pass_prog__open_and_load(); 1129 + if (!ASSERT_OK_PTR(skel, "open_and_load")) 1130 + return; 1131 + 1132 + if 
(create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1)) 1133 + goto end; 1134 + 1135 + prog = skel->progs.prog_skb_verdict_ingress; 1136 + map = bpf_map__fd(skel->maps.sock_map_rx); 1137 + 1138 + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); 1139 + if (!ASSERT_OK(err, "bpf_prog_attach verdict")) 1140 + goto end; 1141 + 1142 + if (strp) { 1143 + prog = skel->progs.prog_skb_verdict_ingress_strp; 1144 + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_PARSER, 0); 1145 + if (!ASSERT_OK(err, "bpf_prog_attach parser")) 1146 + goto end; 1147 + } 1148 + 1149 + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); 1150 + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) 1151 + goto end; 1152 + 1153 + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); 1154 + if (!ASSERT_OK(err, "bpf_map_update_elem(p1)")) 1155 + goto end; 1156 + 1157 + /* just trigger sockamp: data sent by c0 will be received by p1 */ 1158 + sent = xsend(c0, buf, sizeof(buf), 0); 1159 + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), bpf")) 1160 + goto end; 1161 + 1162 + /* do partial read */ 1163 + recvd = recv_timeout(p1, rcv, 1, MSG_DONTWAIT, 1); 1164 + recvd += recv_timeout(p1, rcv + 1, sizeof(rcv) - 1, MSG_DONTWAIT, 1); 1165 + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), bpf") || 1166 + !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch")) 1167 + goto end; 1168 + 1169 + /* uninstall sockmap of p1 and p0 */ 1170 + err = bpf_map_delete_elem(map, &one); 1171 + if (!ASSERT_OK(err, "bpf_map_delete_elem(1)")) 1172 + goto end; 1173 + 1174 + err = bpf_map_delete_elem(map, &zero); 1175 + if (!ASSERT_OK(err, "bpf_map_delete_elem(0)")) 1176 + goto end; 1177 + 1178 + /* now all sockets become plain socket, they should still work */ 1179 + for (i = 0; i < 5; i++) { 1180 + /* test copied_seq of p1 by running tcp native stack */ 1181 + sent = xsend(c1, buf, sizeof(buf), 0); 1182 + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c1), native")) 1183 + goto end; 1184 + 
1185 + recvd = recv(p1, rcv, sizeof(rcv), MSG_DONTWAIT); 1186 + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), native")) 1187 + goto end; 1188 + 1189 + /* p0 previously redirected skb to p1, we also check copied_seq of p0 */ 1190 + sent = xsend(c0, buf, sizeof(buf), 0); 1191 + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), native")) 1192 + goto end; 1193 + 1194 + recvd = recv(p0, rcv, sizeof(rcv), MSG_DONTWAIT); 1195 + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p0), native")) 1196 + goto end; 1197 + } 1198 + 1199 + end: 1200 + if (c0 >= 0) 1201 + close(c0); 1202 + if (p0 >= 0) 1203 + close(p0); 1204 + if (c1 >= 0) 1205 + close(c1); 1206 + if (p1 >= 0) 1207 + close(p1); 1208 + test_sockmap_pass_prog__destroy(skel); 1209 + } 1210 + 1211 + /* Wait until FIONREAD returns the expected value or timeout */ 1212 + static int wait_for_fionread(int fd, int expected, unsigned int timeout_ms) 1213 + { 1214 + unsigned int elapsed = 0; 1215 + int avail = 0; 1216 + 1217 + while (elapsed < timeout_ms) { 1218 + if (ioctl(fd, FIONREAD, &avail) < 0) 1219 + return -errno; 1220 + if (avail >= expected) 1221 + return avail; 1222 + usleep(1000); 1223 + elapsed++; 1224 + } 1225 + return avail; 1226 + } 1227 + 1228 + /* it is used to send data to via native stack and BPF redirecting */ 1229 + static void test_sockmap_multi_channels(int sotype) 1230 + { 1231 + int map, err, sent, recvd, zero = 0, one = 1, avail = 0, expected; 1232 + struct test_sockmap_pass_prog *skel = NULL; 1233 + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; 1234 + char buf[10] = "0123456789", rcv[11]; 1235 + struct bpf_program *prog; 1236 + 1237 + skel = test_sockmap_pass_prog__open_and_load(); 1238 + if (!ASSERT_OK_PTR(skel, "open_and_load")) 1239 + return; 1240 + 1241 + err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1); 1242 + if (err) 1243 + goto end; 1244 + 1245 + prog = skel->progs.prog_skb_verdict_ingress; 1246 + map = bpf_map__fd(skel->maps.sock_map_rx); 1247 + 1248 + err = 
bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); 1249 + if (!ASSERT_OK(err, "bpf_prog_attach verdict")) 1250 + goto end; 1251 + 1252 + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); 1253 + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) 1254 + goto end; 1255 + 1256 + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); 1257 + if (!ASSERT_OK(err, "bpf_map_update_elem")) 1258 + goto end; 1259 + 1260 + /* send data to p1 via native stack */ 1261 + sent = xsend(c1, buf, 2, 0); 1262 + if (!ASSERT_EQ(sent, 2, "xsend(2)")) 1263 + goto end; 1264 + 1265 + avail = wait_for_fionread(p1, 2, IO_TIMEOUT_SEC); 1266 + ASSERT_EQ(avail, 2, "ioctl(FIONREAD) partial return"); 1267 + 1268 + /* send data to p1 via bpf redirecting */ 1269 + sent = xsend(c0, buf + 2, sizeof(buf) - 2, 0); 1270 + if (!ASSERT_EQ(sent, sizeof(buf) - 2, "xsend(remain-data)")) 1271 + goto end; 1272 + 1273 + /* Poll FIONREAD until expected bytes arrive, poll_read() is unreliable 1274 + * here since it may return immediately if prior data is already queued. 1275 + */ 1276 + expected = sotype == SOCK_DGRAM ? 
2 : sizeof(buf); 1277 + avail = wait_for_fionread(p1, expected, IO_TIMEOUT_SEC); 1278 + ASSERT_EQ(avail, expected, "ioctl(FIONREAD) full return"); 1279 + 1280 + recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1); 1281 + if (!ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(p1)") || 1282 + !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch")) 1283 + goto end; 1284 + end: 1285 + if (c0 >= 0) 1286 + close(c0); 1287 + if (p0 >= 0) 1288 + close(p0); 1289 + if (c1 >= 0) 1290 + close(c1); 1291 + if (p1 >= 0) 1292 + close(p1); 1293 + test_sockmap_pass_prog__destroy(skel); 1294 + } 1295 + 1066 1296 void test_sockmap_basic(void) 1067 1297 { 1068 1298 if (test__start_subtest("sockmap create_update_free")) ··· 1380 1108 test_sockmap_skb_verdict_vsock_poll(); 1381 1109 if (test__start_subtest("sockmap vsock unconnected")) 1382 1110 test_sockmap_vsock_unconnected(); 1111 + if (test__start_subtest("sockmap with zc")) 1112 + test_sockmap_zc(); 1113 + if (test__start_subtest("sockmap recover")) 1114 + test_sockmap_copied_seq(false); 1115 + if (test__start_subtest("sockmap recover with strp")) 1116 + test_sockmap_copied_seq(true); 1117 + if (test__start_subtest("sockmap tcp multi channels")) 1118 + test_sockmap_multi_channels(SOCK_STREAM); 1119 + if (test__start_subtest("sockmap udp multi channels")) 1120 + test_sockmap_multi_channels(SOCK_DGRAM); 1383 1121 }
+14
tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c
··· 44 44 return SK_PASS; 45 45 } 46 46 47 + SEC("sk_skb/stream_verdict") 48 + int prog_skb_verdict_ingress(struct __sk_buff *skb) 49 + { 50 + int one = 1; 51 + 52 + return bpf_sk_redirect_map(skb, &sock_map_rx, one, BPF_F_INGRESS); 53 + } 54 + 55 + SEC("sk_skb/stream_parser") 56 + int prog_skb_verdict_ingress_strp(struct __sk_buff *skb) 57 + { 58 + return skb->len; 59 + } 60 + 47 61 char _license[] SEC("license") = "GPL";