Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'zcrx-updates-6.19' into for-6.19/io_uring

Merge zcrx updates from Pavel:

"Zcrx updates for 6.19. It includes a bunch of small patches,
IORING_REGISTER_ZCRX_CTRL and RQ flushing (Patches 4-5) and David's
work on sharing zcrx b/w multiple io_uring instances."

Link: https://lore.kernel.org/io-uring/cover.1763029704.git.asml.silence@gmail.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* zcrx-updates-6.19:
io_uring/zcrx: share an ifq between rings
io_uring/zcrx: add io_fill_zcrx_offsets()
io_uring/zcrx: export zcrx via a file
io_uring/zcrx: move io_zcrx_scrub() and dependencies up
io_uring/zcrx: count zcrx users
io_uring/zcrx: add sync refill queue flushing
io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL
io_uring/zcrx: elide passing msg flags
io_uring/zcrx: use folio_nr_pages() instead of shift operation
io_uring/zcrx: convert to use netmem_desc

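[Editor's note] The sharing work highlighted above can be exercised from userspace roughly as follows. This is only an illustrative sketch based on the uapi added in this merge (IORING_REGISTER_ZCRX_CTRL, ZCRX_CTRL_EXPORT, ZCRX_REG_IMPORT); it assumes headers from this series, uses the raw io_uring_register(2) syscall rather than any liburing helper, and elides ring setup, area registration, and error handling. The helper names and the nr_args value for the import path are assumptions, not part of the series. Per import_zcrx() below, the importing ring must be created with IORING_SETUP_DEFER_TASKRUN and a CQE32 or mixed CQE layout.

/*
 * Hedged sketch: export a registered zcrx instance from ring A as an fd,
 * then import it into ring B. Assumes updated uapi headers from this series.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int uring_register(int ring_fd, unsigned op, void *arg, unsigned nr_args)
{
	return syscall(__NR_io_uring_register, ring_fd, op, arg, nr_args);
}

/* Ring A: ask the kernel for an fd referring to an already registered ifq. */
static int export_zcrx_fd(int ring_a_fd, __u32 zcrx_id)
{
	struct zcrx_ctrl ctrl;

	memset(&ctrl, 0, sizeof(ctrl));		/* reserved fields must be zero */
	ctrl.zcrx_id = zcrx_id;
	ctrl.op = ZCRX_CTRL_EXPORT;
	/* nr_args must be 0, as enforced by io_zcrx_ctrl() */
	if (uring_register(ring_a_fd, IORING_REGISTER_ZCRX_CTRL, &ctrl, 0))
		return -1;
	return ctrl.zc_export.zcrx_fd;		/* filled in by the kernel */
}

/* Ring B: import the exported ifq; if_idx carries the fd, other fields stay zero. */
static int import_zcrx_fd(int ring_b_fd, int zcrx_fd, __u32 *zcrx_id_out)
{
	struct io_uring_zcrx_ifq_reg reg;

	memset(&reg, 0, sizeof(reg));
	reg.flags = ZCRX_REG_IMPORT;
	reg.if_idx = zcrx_fd;
	/* one argument, following the existing ZCRX_IFQ registration convention (assumed) */
	if (uring_register(ring_b_fd, IORING_REGISTER_ZCRX_IFQ, &reg, 1))
		return -1;
	*zcrx_id_out = reg.zcrx_id;		/* id valid on ring B */
	return 0;
}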
+317 -61
+34
include/uapi/linux/io_uring.h
···
 	/* query various aspects of io_uring, see linux/io_uring/query.h */
 	IORING_REGISTER_QUERY = 35,
 
+	/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
+	IORING_REGISTER_ZCRX_CTRL = 36,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
···
 	__u64 __resv2[2];
 };
 
+enum zcrx_reg_flags {
+	ZCRX_REG_IMPORT = 1,
+};
+
 /*
  * Argument for IORING_REGISTER_ZCRX_IFQ
  */
···
 	__u32 zcrx_id;
 	__u32 __resv2;
 	__u64 __resv[3];
+};
+
+enum zcrx_ctrl_op {
+	ZCRX_CTRL_FLUSH_RQ,
+	ZCRX_CTRL_EXPORT,
+
+	__ZCRX_CTRL_LAST,
+};
+
+struct zcrx_ctrl_flush_rq {
+	__u64 __resv[6];
+};
+
+struct zcrx_ctrl_export {
+	__u32 zcrx_fd;
+	__u32 __resv1[11];
+};
+
+struct zcrx_ctrl {
+	__u32 zcrx_id;
+	__u32 op; /* see enum zcrx_ctrl_op */
+	__u64 __resv[2];
+
+	union {
+		struct zcrx_ctrl_export zc_export;
+		struct zcrx_ctrl_flush_rq zc_flush;
+	};
 };
 
 #ifdef __cplusplus
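[Editor's note] As a usage note for the control structures above, this is a sketch (not part of the series) of how userspace might issue a synchronous refill-queue flush. ring_fd and zcrx_id are placeholders for an existing ring and a previously registered zcrx instance; all reserved fields must be zero and nr_args must be 0, as io_zcrx_ctrl() enforces below.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* Flush the refill queue of zcrx instance 'zcrx_id' registered on 'ring_fd'. */
static int flush_zcrx_refill_queue(int ring_fd, __u32 zcrx_id)
{
	struct zcrx_ctrl ctrl;

	memset(&ctrl, 0, sizeof(ctrl));	/* reserved fields must be zero */
	ctrl.zcrx_id = zcrx_id;
	ctrl.op = ZCRX_CTRL_FLUSH_RQ;

	/* nr_args must be 0; the kernel rejects anything else */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ZCRX_CTRL, &ctrl, 0);
}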
+2 -5
io_uring/net.c
···
 
 struct io_recvzc {
 	struct file *file;
-	unsigned msg_flags;
 	u16 flags;
 	u32 len;
 	struct io_zcrx_ifq *ifq;
···
 
 	zc->len = READ_ONCE(sqe->len);
 	zc->flags = READ_ONCE(sqe->ioprio);
-	zc->msg_flags = READ_ONCE(sqe->msg_flags);
-	if (zc->msg_flags)
+	if (READ_ONCE(sqe->msg_flags))
 		return -EINVAL;
 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
 		return -EINVAL;
···
 		return -ENOTSOCK;
 
 	len = zc->len;
-	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
-			   issue_flags, &zc->len);
+	ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
 	if (len && zc->len == 0) {
 		io_req_set_res(req, 0, 0);
 
+3
io_uring/register.c
···
 	case IORING_REGISTER_QUERY:
 		ret = io_query(ctx, arg, nr_args);
 		break;
+	case IORING_REGISTER_ZCRX_CTRL:
+		ret = io_zcrx_ctrl(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
+270 -56
io_uring/zcrx.c
···
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/skbuff_ref.h>
+#include <linux/anon_inodes.h>
 
 #include <net/page_pool/helpers.h>
 #include <net/page_pool/memory_provider.h>
···
 		if (folio == last_folio)
 			continue;
 		last_folio = folio;
-		res += 1UL << folio_order(folio);
+		res += folio_nr_pages(folio);
 	}
 	return res;
 }
···
 	atomic_inc(io_get_user_counter(niov));
 }
 
+static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets)
+{
+	offsets->head = offsetof(struct io_uring, head);
+	offsets->tail = offsetof(struct io_uring, tail);
+	offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+}
+
 static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
 				 struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg,
···
 	void *ptr;
 	int ret;
 
-	off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+	io_fill_zcrx_offsets(&reg->offsets);
+	off = reg->offsets.rqes;
 	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
 	if (size > rd->size)
 		return -EINVAL;
···
 	ifq->rq_ring = (struct io_uring *)ptr;
 	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
 
-	reg->offsets.head = offsetof(struct io_uring, head);
-	reg->offsets.tail = offsetof(struct io_uring, tail);
-	reg->offsets.rqes = off;
 	return 0;
 }
 
···
 	spin_lock_init(&ifq->rq_lock);
 	mutex_init(&ifq->pp_lock);
 	refcount_set(&ifq->refs, 1);
+	refcount_set(&ifq->user_refs, 1);
 	return ifq;
 }
 
···
 	io_zcrx_ifq_free(ifq);
 }
 
+static void io_zcrx_return_niov_freelist(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	spin_lock_bh(&area->freelist_lock);
+	area->freelist[area->free_count++] = net_iov_idx(niov);
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_return_niov(struct net_iov *niov)
+{
+	netmem_ref netmem = net_iov_to_netmem(niov);
+
+	if (!niov->desc.pp) {
+		/* copy fallback allocated niovs */
+		io_zcrx_return_niov_freelist(niov);
+		return;
+	}
+	page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false);
+}
+
+static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+	int i;
+
+	if (!area)
+		return;
+
+	/* Reclaim back all buffers given to the user space. */
+	for (i = 0; i < area->nia.num_niovs; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+		int nr;
+
+		if (!atomic_read(io_get_user_counter(niov)))
+			continue;
+		nr = atomic_xchg(io_get_user_counter(niov), 0);
+		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
+			io_zcrx_return_niov(niov);
+	}
+}
+
+static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+{
+	if (refcount_dec_and_test(&ifq->user_refs)) {
+		io_close_queue(ifq);
+		io_zcrx_scrub(ifq);
+	}
+	io_put_zcrx_ifq(ifq);
+}
+
 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 					    unsigned int id)
 {
···
 	lockdep_assert_held(&ctx->mmap_lock);
 
 	return ifq ? &ifq->region : NULL;
+}
+
+static int zcrx_box_release(struct inode *inode, struct file *file)
+{
+	struct io_zcrx_ifq *ifq = file->private_data;
+
+	if (WARN_ON_ONCE(!ifq))
+		return -EFAULT;
+	zcrx_unregister(ifq);
+	return 0;
+}
+
+static const struct file_operations zcrx_box_fops = {
+	.owner = THIS_MODULE,
+	.release = zcrx_box_release,
+};
+
+static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
+		       struct zcrx_ctrl *ctrl, void __user *arg)
+{
+	struct zcrx_ctrl_export *ce = &ctrl->zc_export;
+	struct file *file;
+	int fd = -1;
+
+	if (!mem_is_zero(ce, sizeof(*ce)))
+		return -EINVAL;
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	ce->zcrx_fd = fd;
+	if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
+		put_unused_fd(fd);
+		return -EFAULT;
+	}
+
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
+					 ifq, O_CLOEXEC, NULL);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		zcrx_unregister(ifq);
+		return PTR_ERR(file);
+	}
+
+	fd_install(fd, file);
+	return 0;
+}
+
+static int import_zcrx(struct io_ring_ctx *ctx,
+		       struct io_uring_zcrx_ifq_reg __user *arg,
+		       struct io_uring_zcrx_ifq_reg *reg)
+{
+	struct io_zcrx_ifq *ifq;
+	struct file *file;
+	int fd, ret;
+	u32 id;
+
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EINVAL;
+	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
+		return -EINVAL;
+	if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
+		return -EINVAL;
+
+	fd = reg->if_idx;
+	CLASS(fd, f)(fd);
+	if (fd_empty(f))
+		return -EBADF;
+
+	file = fd_file(f);
+	if (file->f_op != &zcrx_box_fops || !file->private_data)
+		return -EBADF;
+
+	ifq = file->private_data;
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+		if (ret)
+			goto err;
+	}
+
+	reg->zcrx_id = id;
+	io_fill_zcrx_offsets(&reg->offsets);
+	if (copy_to_user(arg, reg, sizeof(*reg))) {
+		ret = -EFAULT;
+		goto err_xa_erase;
+	}
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = -ENOMEM;
+		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+			goto err_xa_erase;
+	}
+
+	return 0;
+err_xa_erase:
+	scoped_guard(mutex, &ctx->mmap_lock)
+		xa_erase(&ctx->zcrx_ctxs, id);
+err:
+	zcrx_unregister(ifq);
+	return ret;
 }
 
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
···
 		return -EINVAL;
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
-	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
-		return -EFAULT;
 	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
 	    reg.__resv2 || reg.zcrx_id)
 		return -EINVAL;
+	if (reg.flags & ZCRX_REG_IMPORT)
+		return import_zcrx(ctx, arg, &reg);
+	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+		return -EFAULT;
 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
 		return -EINVAL;
 	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
···
 	return &area->nia.niovs[niov_idx];
 }
 
-static void io_zcrx_return_niov_freelist(struct net_iov *niov)
-{
-	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
-	spin_lock_bh(&area->freelist_lock);
-	area->freelist[area->free_count++] = net_iov_idx(niov);
-	spin_unlock_bh(&area->freelist_lock);
-}
-
-static void io_zcrx_return_niov(struct net_iov *niov)
-{
-	netmem_ref netmem = net_iov_to_netmem(niov);
-
-	if (!niov->pp) {
-		/* copy fallback allocated niovs */
-		io_zcrx_return_niov_freelist(niov);
-		return;
-	}
-	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
-}
-
-static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
-{
-	struct io_zcrx_area *area = ifq->area;
-	int i;
-
-	if (!area)
-		return;
-
-	/* Reclaim back all buffers given to the user space. */
-	for (i = 0; i < area->nia.num_niovs; i++) {
-		struct net_iov *niov = &area->nia.niovs[i];
-		int nr;
-
-		if (!atomic_read(io_get_user_counter(niov)))
-			continue;
-		nr = atomic_xchg(io_get_user_counter(niov), 0);
-		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
-			io_zcrx_return_niov(niov);
-	}
-}
-
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 	struct io_zcrx_ifq *ifq;
···
 		}
 		if (!ifq)
 			break;
-
-		io_close_queue(ifq);
-		io_zcrx_scrub(ifq);
-		io_put_zcrx_ifq(ifq);
+		zcrx_unregister(ifq);
 	}
 
 	xa_destroy(&ctx->zcrx_ctxs);
···
 		if (!page_pool_unref_and_test(netmem))
 			continue;
 
-		if (unlikely(niov->pp != pp)) {
+		if (unlikely(niov->desc.pp != pp)) {
 			io_zcrx_return_niov(niov);
 			continue;
 		}
···
 	.nl_fill = io_pp_nl_fill,
 	.uninstall = io_pp_uninstall,
 };
+
+static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
+			      struct io_zcrx_ifq *zcrx)
+{
+	unsigned int mask = zcrx->rq_entries - 1;
+	unsigned int i;
+
+	guard(spinlock_bh)(&zcrx->rq_lock);
+
+	nr = min(nr, io_zcrx_rqring_entries(zcrx));
+	for (i = 0; i < nr; i++) {
+		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
+		struct net_iov *niov;
+
+		if (!io_parse_rqe(rqe, zcrx, &niov))
+			break;
+		netmem_array[i] = net_iov_to_netmem(niov);
+	}
+
+	smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
+	return i;
+}
+
+#define ZCRX_FLUSH_BATCH 32
+
+static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr)
+{
+	unsigned i;
+
+	for (i = 0; i < nr; i++) {
+		netmem_ref netmem = netmems[i];
+		struct net_iov *niov = netmem_to_net_iov(netmem);
+
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		if (!page_pool_unref_and_test(netmem))
+			continue;
+		io_zcrx_return_niov(niov);
+	}
+}
+
+static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
+			 struct zcrx_ctrl *ctrl)
+{
+	struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush;
+	netmem_ref netmems[ZCRX_FLUSH_BATCH];
+	unsigned total = 0;
+	unsigned nr;
+
+	if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv)))
+		return -EINVAL;
+
+	do {
+		nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
+
+		zcrx_return_buffers(netmems, nr);
+		total += nr;
+
+		if (fatal_signal_pending(current))
+			break;
+		cond_resched();
+	} while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
+
+	return 0;
+}
+
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+{
+	struct zcrx_ctrl ctrl;
+	struct io_zcrx_ifq *zcrx;
+
+	if (nr_args)
+		return -EINVAL;
+	if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
+		return -EFAULT;
+	if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv)))
+		return -EFAULT;
+
+	zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
+	if (!zcrx)
+		return -ENXIO;
+
+	switch (ctrl.op) {
+	case ZCRX_CTRL_FLUSH_RQ:
+		return zcrx_flush_rq(ctx, zcrx, &ctrl);
+	case ZCRX_CTRL_EXPORT:
+		return zcrx_export(ctx, zcrx, &ctrl, arg);
+	}
+
+	return -EOPNOTSUPP;
+}
 
 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
 			      struct io_zcrx_ifq *ifq, int off, int len)
···
 			     const skb_frag_t *frag, int off, int len)
 {
 	struct net_iov *niov;
+	struct page_pool *pp;
 
 	if (unlikely(!skb_frag_is_net_iov(frag)))
 		return io_zcrx_copy_frag(req, ifq, frag, off, len);
 
 	niov = netmem_to_net_iov(frag->netmem);
-	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
-	    io_pp_to_ifq(niov->pp) != ifq)
+	pp = niov->desc.pp;
+
+	if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq)
 		return -EFAULT;
 
 	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
+8
io_uring/zcrx.h
···
 	struct net_device *netdev;
 	netdevice_tracker netdev_tracker;
 	refcount_t refs;
+	/* counts userspace facing users like io_uring */
+	refcount_t user_refs;
 
 	/*
 	 * Page pool and net configuration lock, can be taken deeper in the
···
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg);
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
···
 					    unsigned int id)
 {
 	return NULL;
+}
+static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx,
+			       void __user *arg, unsigned nr_arg)
+{
+	return -EOPNOTSUPP;
 }
 #endif
 