Merge tag 'for-6.10/io_uring-20240511' of git://git.kernel.dk/linux

+1 -1

drivers/net/tap.c

··· 754 754 skb_zcopy_init(skb, msg_control); 755 755 } else if (msg_control) { 756 756 struct ubuf_info *uarg = msg_control; 757 - uarg->callback(NULL, uarg, false); 757 + uarg->ops->complete(NULL, uarg, false); 758 758 } 759 759 760 760 dev_queue_xmit(skb);

+1 -1

drivers/net/tun.c

··· 1906 1906 skb_zcopy_init(skb, msg_control); 1907 1907 } else if (msg_control) { 1908 1908 struct ubuf_info *uarg = msg_control; 1909 - uarg->callback(NULL, uarg, false); 1909 + uarg->ops->complete(NULL, uarg, false); 1910 1910 } 1911 1911 1912 1912 skb_reset_network_header(skb);

+2 -3

drivers/net/xen-netback/common.h

··· 390 390 391 391 void xenvif_carrier_on(struct xenvif *vif); 392 392 393 - /* Callback from stack when TX packet can be released */ 394 - void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf, 395 - bool zerocopy_success); 393 + /* Callbacks from stack when TX packet can be released */ 394 + extern const struct ubuf_info_ops xenvif_ubuf_ops; 396 395 397 396 static inline pending_ring_idx_t nr_pending_reqs(struct xenvif_queue *queue) 398 397 {

+1 -1

drivers/net/xen-netback/interface.c

··· 593 593 594 594 for (i = 0; i < MAX_PENDING_REQS; i++) { 595 595 queue->pending_tx_info[i].callback_struct = (struct ubuf_info_msgzc) 596 - { { .callback = xenvif_zerocopy_callback }, 596 + { { .ops = &xenvif_ubuf_ops }, 597 597 { { .ctx = NULL, 598 598 .desc = i } } }; 599 599 queue->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;

+8 -3

drivers/net/xen-netback/netback.c

··· 1156 1156 uarg = skb_shinfo(skb)->destructor_arg; 1157 1157 /* increase inflight counter to offset decrement in callback */ 1158 1158 atomic_inc(&queue->inflight_packets); 1159 - uarg->callback(NULL, uarg, true); 1159 + uarg->ops->complete(NULL, uarg, true); 1160 1160 skb_shinfo(skb)->destructor_arg = NULL; 1161 1161 1162 1162 /* Fill the skb with the new (local) frags. */ ··· 1278 1278 return work_done; 1279 1279 } 1280 1280 1281 - void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf_base, 1282 - bool zerocopy_success) 1281 + static void xenvif_zerocopy_callback(struct sk_buff *skb, 1282 + struct ubuf_info *ubuf_base, 1283 + bool zerocopy_success) 1283 1284 { 1284 1285 unsigned long flags; 1285 1286 pending_ring_idx_t index; ··· 1312 1311 queue->stats.tx_zerocopy_fail++; 1313 1312 xenvif_skb_zerocopy_complete(queue); 1314 1313 } 1314 + 1315 + const struct ubuf_info_ops xenvif_ubuf_ops = { 1316 + .complete = xenvif_zerocopy_callback, 1317 + }; 1315 1318 1316 1319 static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue) 1317 1320 {

+11 -4

drivers/nvme/host/ioctl.c

··· 423 423 pdu->result = le64_to_cpu(nvme_req(req)->result.u64); 424 424 425 425 /* 426 - * For iopoll, complete it directly. 426 + * For iopoll, complete it directly. Note that using the uring_cmd 427 + * helper for this is safe only because we check blk_rq_is_poll(). 428 + * As that returns false if we're NOT on a polled queue, then it's 429 + * safe to use the polled completion helper. 430 + * 427 431 * Otherwise, move the completion to task work. 428 432 */ 429 - if (blk_rq_is_poll(req)) 430 - nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); 431 - else 433 + if (blk_rq_is_poll(req)) { 434 + if (pdu->bio) 435 + blk_rq_unmap_user(pdu->bio); 436 + io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status); 437 + } else { 432 438 io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); 439 + } 433 440 434 441 return RQ_END_IO_FREE; 435 442 }

+6 -2

drivers/vhost/net.c

··· 380 380 } 381 381 } 382 382 383 - static void vhost_zerocopy_callback(struct sk_buff *skb, 383 + static void vhost_zerocopy_complete(struct sk_buff *skb, 384 384 struct ubuf_info *ubuf_base, bool success) 385 385 { 386 386 struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base); ··· 407 407 408 408 rcu_read_unlock_bh(); 409 409 } 410 + 411 + static const struct ubuf_info_ops vhost_ubuf_ops = { 412 + .complete = vhost_zerocopy_complete, 413 + }; 410 414 411 415 static inline unsigned long busy_clock(void) 412 416 { ··· 883 879 vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; 884 880 ubuf->ctx = nvq->ubufs; 885 881 ubuf->desc = nvq->upend_idx; 886 - ubuf->ubuf.callback = vhost_zerocopy_callback; 882 + ubuf->ubuf.ops = &vhost_ubuf_ops; 887 883 ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG; 888 884 refcount_set(&ubuf->ubuf.refcnt, 1); 889 885 msg.msg_control = &ctl;

-6

include/linux/io_uring.h

··· 11 11 void __io_uring_free(struct task_struct *tsk); 12 12 void io_uring_unreg_ringfd(void); 13 13 const char *io_uring_get_opcode(u8 opcode); 14 - int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); 15 14 bool io_is_uring_fops(struct file *file); 16 15 17 16 static inline void io_uring_files_cancel(void) ··· 43 44 static inline const char *io_uring_get_opcode(u8 opcode) 44 45 { 45 46 return ""; 46 - } 47 - static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, 48 - unsigned int issue_flags) 49 - { 50 - return -EOPNOTSUPP; 51 47 } 52 48 static inline bool io_is_uring_fops(struct file *file) 53 49 {

+24

include/linux/io_uring/cmd.h

··· 26 26 #if defined(CONFIG_IO_URING) 27 27 int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 28 28 struct iov_iter *iter, void *ioucmd); 29 + 30 + /* 31 + * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd 32 + * and the corresponding io_uring request. 33 + * 34 + * Note: the caller should never hard code @issue_flags and is only allowed 35 + * to pass the mask provided by the core io_uring code. 36 + */ 29 37 void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, 30 38 unsigned issue_flags); 39 + 31 40 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 32 41 void (*task_work_cb)(struct io_uring_cmd *, unsigned), 33 42 unsigned flags); 34 43 44 + /* 45 + * Note: the caller should never hard code @issue_flags and only use the 46 + * mask provided by the core io_uring code. 47 + */ 35 48 void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, 36 49 unsigned int issue_flags); 37 50 ··· 68 55 { 69 56 } 70 57 #endif 58 + 59 + /* 60 + * Polled completions must ensure they are coming from a poll queue, and 61 + * hence are completed inside the usual poll handling loops. 62 + */ 63 + static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd, 64 + ssize_t ret, ssize_t res2) 65 + { 66 + lockdep_assert(in_task()); 67 + io_uring_cmd_done(ioucmd, ret, res2, 0); 68 + } 71 69 72 70 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ 73 71 static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,

+18

include/linux/io_uring/net.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + #ifndef _LINUX_IO_URING_NET_H 3 + #define _LINUX_IO_URING_NET_H 4 + 5 + struct io_uring_cmd; 6 + 7 + #if defined(CONFIG_IO_URING) 8 + int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); 9 + 10 + #else 11 + static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, 12 + unsigned int issue_flags) 13 + { 14 + return -EOPNOTSUPP; 15 + } 16 + #endif 17 + 18 + #endif

+7 -12

include/linux/io_uring_types.h

··· 205 205 206 206 bool plug_started; 207 207 bool need_plug; 208 + bool cq_flush; 208 209 unsigned short submit_nr; 209 210 unsigned int cqes_count; 210 211 struct blk_plug plug; ··· 220 219 }; 221 220 222 221 struct io_alloc_cache { 223 - struct io_wq_work_node list; 222 + void **entries; 224 223 unsigned int nr_cached; 225 224 unsigned int max_cached; 226 225 size_t elem_size; ··· 300 299 struct io_hash_table cancel_table_locked; 301 300 struct io_alloc_cache apoll_cache; 302 301 struct io_alloc_cache netmsg_cache; 302 + struct io_alloc_cache rw_cache; 303 + struct io_alloc_cache uring_cache; 303 304 304 305 /* 305 306 * Any cancelable uring_cmd is added to this list in ··· 344 341 unsigned cq_last_tm_flush; 345 342 } ____cacheline_aligned_in_smp; 346 343 347 - struct io_uring_cqe completion_cqes[16]; 348 - 349 344 spinlock_t completion_lock; 350 - 351 - /* IRQ completion list, under ->completion_lock */ 352 - unsigned int locked_free_nr; 353 - struct io_wq_work_list locked_free_list; 354 345 355 346 struct list_head io_buffers_comp; 356 347 struct list_head cq_overflow_list; ··· 367 370 unsigned int file_alloc_end; 368 371 369 372 struct list_head io_buffers_cache; 370 - 371 - /* deferred free list, protected by ->uring_lock */ 372 - struct hlist_head io_buf_list; 373 373 374 374 /* Keep this last, we don't need it for the fast path */ 375 375 struct wait_queue_head poll_wq; ··· 432 438 }; 433 439 434 440 struct io_tw_state { 435 - /* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */ 436 - bool locked; 437 441 }; 438 442 439 443 enum { ··· 472 480 REQ_F_CAN_POLL_BIT, 473 481 REQ_F_BL_EMPTY_BIT, 474 482 REQ_F_BL_NO_RECYCLE_BIT, 483 + REQ_F_BUFFERS_COMMIT_BIT, 475 484 476 485 /* not a real bit, just to check we're not overflowing the space */ 477 486 __REQ_F_LAST_BIT, ··· 551 558 REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT), 552 559 /* don't recycle provided buffers for this request */ 553 560 REQ_F_BL_NO_RECYCLE = 
IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT), 561 + /* buffer ring head needs incrementing on put */ 562 + REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), 554 563 }; 555 564 556 565 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);

+13 -8

include/linux/skbuff.h

··· 527 527 #define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \ 528 528 SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS) 529 529 530 + struct ubuf_info_ops { 531 + void (*complete)(struct sk_buff *, struct ubuf_info *, 532 + bool zerocopy_success); 533 + /* has to be compatible with skb_zcopy_set() */ 534 + int (*link_skb)(struct sk_buff *skb, struct ubuf_info *uarg); 535 + }; 536 + 530 537 /* 531 538 * The callback notifies userspace to release buffers when skb DMA is done in 532 539 * lower device, the skb last reference should be 0 when calling this. ··· 543 536 * The desc field is used to track userspace buffer index. 544 537 */ 545 538 struct ubuf_info { 546 - void (*callback)(struct sk_buff *, struct ubuf_info *, 547 - bool zerocopy_success); 539 + const struct ubuf_info_ops *ops; 548 540 refcount_t refcnt; 549 541 u8 flags; 550 542 }; ··· 1668 1662 } 1669 1663 #endif 1670 1664 1665 + extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; 1666 + 1671 1667 struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, 1672 1668 struct ubuf_info *uarg); 1673 1669 1674 1670 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); 1675 - 1676 - void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, 1677 - bool success); 1678 1671 1679 1672 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, 1680 1673 struct sk_buff *skb, struct iov_iter *from, ··· 1762 1757 static inline void net_zcopy_put(struct ubuf_info *uarg) 1763 1758 { 1764 1759 if (uarg) 1765 - uarg->callback(NULL, uarg, true); 1760 + uarg->ops->complete(NULL, uarg, true); 1766 1761 } 1767 1762 1768 1763 static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) 1769 1764 { 1770 1765 if (uarg) { 1771 - if (uarg->callback == msg_zerocopy_callback) 1766 + if (uarg->ops == &msg_zerocopy_ubuf_ops) 1772 1767 msg_zerocopy_put_abort(uarg, have_uref); 1773 1768 else if (have_uref) 1774 1769 net_zcopy_put(uarg); ··· 1782 1777 
1783 1778 if (uarg) { 1784 1779 if (!skb_zcopy_is_nouarg(skb)) 1785 - uarg->callback(skb, uarg, zerocopy_success); 1780 + uarg->ops->complete(skb, uarg, zerocopy_success); 1786 1781 1787 1782 skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; 1788 1783 }

+28 -10

include/uapi/linux/io_uring.h

··· 72 72 __u32 waitid_flags; 73 73 __u32 futex_flags; 74 74 __u32 install_fd_flags; 75 + __u32 nop_flags; 75 76 }; 76 77 __u64 user_data; /* data to be passed back at completion time */ 77 78 /* pack this to avoid bogus arm OABI complaints */ ··· 116 115 */ 117 116 #define IORING_FILE_INDEX_ALLOC (~0U) 118 117 119 - enum { 118 + enum io_uring_sqe_flags_bit { 120 119 IOSQE_FIXED_FILE_BIT, 121 120 IOSQE_IO_DRAIN_BIT, 122 121 IOSQE_IO_LINK_BIT, ··· 352 351 * 0 is reported if zerocopy was actually possible. 353 352 * IORING_NOTIF_USAGE_ZC_COPIED if data was copied 354 353 * (at least partially). 354 + * 355 + * IORING_RECVSEND_BUNDLE Used with IOSQE_BUFFER_SELECT. If set, send or 356 + * recv will grab as many buffers from the buffer 357 + * group ID given and send them all. The completion 358 + * result will be the number of buffers send, with 359 + * the starting buffer ID in cqe->flags as per 360 + * usual for provided buffer usage. The buffers 361 + * will be contigious from the starting buffer ID. 
355 362 */ 356 363 #define IORING_RECVSEND_POLL_FIRST (1U << 0) 357 364 #define IORING_RECV_MULTISHOT (1U << 1) 358 365 #define IORING_RECVSEND_FIXED_BUF (1U << 2) 359 366 #define IORING_SEND_ZC_REPORT_USAGE (1U << 3) 367 + #define IORING_RECVSEND_BUNDLE (1U << 4) 360 368 361 369 /* 362 370 * cqe.res for IORING_CQE_F_NOTIF if ··· 380 370 * accept flags stored in sqe->ioprio 381 371 */ 382 372 #define IORING_ACCEPT_MULTISHOT (1U << 0) 373 + #define IORING_ACCEPT_DONTWAIT (1U << 1) 374 + #define IORING_ACCEPT_POLL_FIRST (1U << 2) 383 375 384 376 /* 385 377 * IORING_OP_MSG_RING command types, stored in sqe->addr 386 378 */ 387 - enum { 379 + enum io_uring_msg_ring_flags { 388 380 IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */ 389 381 IORING_MSG_SEND_FD, /* send a registered fd to another ring */ 390 382 }; ··· 407 395 * IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC 408 396 */ 409 397 #define IORING_FIXED_FD_NO_CLOEXEC (1U << 0) 398 + 399 + /* 400 + * IORING_OP_NOP flags (sqe->nop_flags) 401 + * 402 + * IORING_NOP_INJECT_RESULT Inject result from sqe->result 403 + */ 404 + #define IORING_NOP_INJECT_RESULT (1U << 0) 410 405 411 406 /* 412 407 * IO completion data structure (Completion Queue Entry) ··· 444 425 #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) 445 426 #define IORING_CQE_F_NOTIF (1U << 3) 446 427 447 - enum { 448 - IORING_CQE_BUFFER_SHIFT = 16, 449 - }; 428 + #define IORING_CQE_BUFFER_SHIFT 16 450 429 451 430 /* 452 431 * Magic offsets for the application to mmap the data it needs ··· 539 522 #define IORING_FEAT_CQE_SKIP (1U << 11) 540 523 #define IORING_FEAT_LINKED_FILE (1U << 12) 541 524 #define IORING_FEAT_REG_REG_RING (1U << 13) 525 + #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) 542 526 543 527 /* 544 528 * io_uring_register(2) opcodes and arguments 545 529 */ 546 - enum { 530 + enum io_uring_register_op { 547 531 IORING_REGISTER_BUFFERS = 0, 548 532 IORING_UNREGISTER_BUFFERS = 1, 549 533 IORING_REGISTER_FILES = 2, ··· 601 
583 }; 602 584 603 585 /* io-wq worker categories */ 604 - enum { 586 + enum io_wq_type { 605 587 IO_WQ_BOUND, 606 588 IO_WQ_UNBOUND, 607 589 }; ··· 706 688 * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) 707 689 * to get a virtual mapping for the ring. 708 690 */ 709 - enum { 691 + enum io_uring_register_pbuf_ring_flags { 710 692 IOU_PBUF_RING_MMAP = 1, 711 693 }; 712 694 ··· 737 719 /* 738 720 * io_uring_restriction->opcode values 739 721 */ 740 - enum { 722 + enum io_uring_register_restriction_op { 741 723 /* Allow an io_uring_register(2) opcode */ 742 724 IORING_RESTRICTION_REGISTER_OP = 0, 743 725 ··· 793 775 /* 794 776 * Argument for IORING_OP_URING_CMD when file is a socket 795 777 */ 796 - enum { 778 + enum io_uring_socket_op { 797 779 SOCKET_URING_OP_SIOCINQ = 0, 798 780 SOCKET_URING_OP_SIOCOUTQ, 799 781 SOCKET_URING_OP_GETSOCKOPT,

+8 -7

io_uring/Makefile

··· 2 2 # 3 3 # Makefile for io_uring 4 4 5 - obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ 6 - sync.o advise.o filetable.o \ 7 - openclose.o uring_cmd.o epoll.o \ 8 - statx.o net.o msg_ring.o timeout.o \ 9 - sqpoll.o fdinfo.o tctx.o poll.o \ 10 - cancel.o kbuf.o rsrc.o rw.o opdef.o \ 11 - notif.o waitid.o register.o truncate.o 5 + obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ 6 + tctx.o filetable.o rw.o net.o poll.o \ 7 + uring_cmd.o openclose.o sqpoll.o \ 8 + xattr.o nop.o fs.o splice.o sync.o \ 9 + msg_ring.o advise.o openclose.o \ 10 + epoll.o statx.o timeout.o fdinfo.o \ 11 + cancel.o waitid.o register.o \ 12 + truncate.o memmap.o 12 13 obj-$(CONFIG_IO_WQ) += io-wq.o 13 14 obj-$(CONFIG_FUTEX) += futex.o 14 15 obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o

+27 -32

io_uring/alloc_cache.h

··· 4 4 /* 5 5 * Don't allow the cache to grow beyond this size. 6 6 */ 7 - #define IO_ALLOC_CACHE_MAX 512 8 - 9 - struct io_cache_entry { 10 - struct io_wq_work_node node; 11 - }; 7 + #define IO_ALLOC_CACHE_MAX 128 12 8 13 9 static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, 14 - struct io_cache_entry *entry) 10 + void *entry) 15 11 { 16 12 if (cache->nr_cached < cache->max_cached) { 17 - cache->nr_cached++; 18 - wq_stack_add_head(&entry->node, &cache->list); 19 - kasan_mempool_poison_object(entry); 13 + if (!kasan_mempool_poison_object(entry)) 14 + return false; 15 + cache->entries[cache->nr_cached++] = entry; 20 16 return true; 21 17 } 22 18 return false; 23 19 } 24 20 25 - static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache) 21 + static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) 26 22 { 27 - return !cache->list.next; 28 - } 23 + if (cache->nr_cached) { 24 + void *entry = cache->entries[--cache->nr_cached]; 29 25 30 - static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) 31 - { 32 - if (cache->list.next) { 33 - struct io_cache_entry *entry; 34 - 35 - entry = container_of(cache->list.next, struct io_cache_entry, node); 36 26 kasan_mempool_unpoison_object(entry, cache->elem_size); 37 - cache->list.next = cache->list.next->next; 38 - cache->nr_cached--; 39 27 return entry; 40 28 } 41 29 42 30 return NULL; 43 31 } 44 32 45 - static inline void io_alloc_cache_init(struct io_alloc_cache *cache, 33 + /* returns false if the cache was initialized properly */ 34 + static inline bool io_alloc_cache_init(struct io_alloc_cache *cache, 46 35 unsigned max_nr, size_t size) 47 36 { 48 - cache->list.next = NULL; 49 - cache->nr_cached = 0; 50 - cache->max_cached = max_nr; 51 - cache->elem_size = size; 37 + cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL); 38 + if (cache->entries) { 39 + cache->nr_cached = 0; 40 + cache->max_cached = max_nr; 41 + cache->elem_size = size; 42 + 
return false; 43 + } 44 + return true; 52 45 } 53 46 54 47 static inline void io_alloc_cache_free(struct io_alloc_cache *cache, 55 - void (*free)(struct io_cache_entry *)) 48 + void (*free)(const void *)) 56 49 { 57 - while (1) { 58 - struct io_cache_entry *entry = io_alloc_cache_get(cache); 50 + void *entry; 59 51 60 - if (!entry) 61 - break; 52 + if (!cache->entries) 53 + return; 54 + 55 + while ((entry = io_alloc_cache_get(cache)) != NULL) 62 56 free(entry); 63 - } 64 - cache->nr_cached = 0; 57 + 58 + kvfree(cache->entries); 59 + cache->entries = NULL; 65 60 } 66 61 #endif

+1 -3

io_uring/cancel.c

··· 184 184 io_ring_submit_lock(ctx, issue_flags); 185 185 ret = -ENOENT; 186 186 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 187 - struct io_uring_task *tctx = node->task->io_uring; 188 - 189 - ret = io_async_cancel_one(tctx, cd); 187 + ret = io_async_cancel_one(node->task->io_uring, cd); 190 188 if (ret != -ENOENT) { 191 189 if (!all) 192 190 break;

+2 -2

io_uring/fdinfo.c

··· 50 50 * Caller holds a reference to the file already, we don't need to do 51 51 * anything else to get an extra reference. 52 52 */ 53 - __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) 53 + __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) 54 54 { 55 - struct io_ring_ctx *ctx = f->private_data; 55 + struct io_ring_ctx *ctx = file->private_data; 56 56 struct io_overflow_cqe *ocqe; 57 57 struct io_rings *r = ctx->rings; 58 58 struct rusage sq_usage;

+2 -2

io_uring/filetable.c

··· 84 84 return ret; 85 85 86 86 file_slot->file_ptr = 0; 87 - io_file_bitmap_clear(&ctx->file_table, slot_index); 87 + } else { 88 + io_file_bitmap_set(&ctx->file_table, slot_index); 88 89 } 89 90 90 91 *io_get_tag_slot(ctx->file_data, slot_index) = 0; 91 92 io_fixed_file_set(file_slot, file); 92 - io_file_bitmap_set(&ctx->file_table, slot_index); 93 93 return 0; 94 94 } 95 95

+12 -18

io_uring/futex.c

··· 9 9 10 10 #include "../kernel/futex/futex.h" 11 11 #include "io_uring.h" 12 - #include "rsrc.h" 12 + #include "alloc_cache.h" 13 13 #include "futex.h" 14 14 15 15 struct io_futex { ··· 27 27 }; 28 28 29 29 struct io_futex_data { 30 - union { 31 - struct futex_q q; 32 - struct io_cache_entry cache; 33 - }; 30 + struct futex_q q; 34 31 struct io_kiocb *req; 35 32 }; 36 33 37 - void io_futex_cache_init(struct io_ring_ctx *ctx) 38 - { 39 - io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX, 40 - sizeof(struct io_futex_data)); 41 - } 34 + #define IO_FUTEX_ALLOC_CACHE_MAX 32 42 35 43 - static void io_futex_cache_entry_free(struct io_cache_entry *entry) 36 + bool io_futex_cache_init(struct io_ring_ctx *ctx) 44 37 { 45 - kfree(container_of(entry, struct io_futex_data, cache)); 38 + return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX, 39 + sizeof(struct io_futex_data)); 46 40 } 47 41 48 42 void io_futex_cache_free(struct io_ring_ctx *ctx) 49 43 { 50 - io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free); 44 + io_alloc_cache_free(&ctx->futex_cache, kfree); 51 45 } 52 46 53 47 static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) ··· 57 63 struct io_ring_ctx *ctx = req->ctx; 58 64 59 65 io_tw_lock(ctx, ts); 60 - if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache)) 66 + if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) 61 67 kfree(ifd); 62 68 __io_futex_complete(req, ts); 63 69 } ··· 253 259 254 260 static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) 255 261 { 256 - struct io_cache_entry *entry; 262 + struct io_futex_data *ifd; 257 263 258 - entry = io_alloc_cache_get(&ctx->futex_cache); 259 - if (entry) 260 - return container_of(entry, struct io_futex_data, cache); 264 + ifd = io_alloc_cache_get(&ctx->futex_cache); 265 + if (ifd) 266 + return ifd; 261 267 262 268 return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); 263 269 }

+3 -2

io_uring/futex.h

··· 13 13 unsigned int issue_flags); 14 14 bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, 15 15 bool cancel_all); 16 - void io_futex_cache_init(struct io_ring_ctx *ctx); 16 + bool io_futex_cache_init(struct io_ring_ctx *ctx); 17 17 void io_futex_cache_free(struct io_ring_ctx *ctx); 18 18 #else 19 19 static inline int io_futex_cancel(struct io_ring_ctx *ctx, ··· 27 27 { 28 28 return false; 29 29 } 30 - static inline void io_futex_cache_init(struct io_ring_ctx *ctx) 30 + static inline bool io_futex_cache_init(struct io_ring_ctx *ctx) 31 31 { 32 + return false; 32 33 } 33 34 static inline void io_futex_cache_free(struct io_ring_ctx *ctx) 34 35 {

+34 -33

io_uring/io-wq.c

··· 25 25 #define WORKER_IDLE_TIMEOUT (5 * HZ) 26 26 27 27 enum { 28 - IO_WORKER_F_UP = 1, /* up and active */ 29 - IO_WORKER_F_RUNNING = 2, /* account as running */ 30 - IO_WORKER_F_FREE = 4, /* worker on free list */ 31 - IO_WORKER_F_BOUND = 8, /* is doing bounded work */ 28 + IO_WORKER_F_UP = 0, /* up and active */ 29 + IO_WORKER_F_RUNNING = 1, /* account as running */ 30 + IO_WORKER_F_FREE = 2, /* worker on free list */ 31 + IO_WORKER_F_BOUND = 3, /* is doing bounded work */ 32 32 }; 33 33 34 34 enum { ··· 44 44 */ 45 45 struct io_worker { 46 46 refcount_t ref; 47 - unsigned flags; 47 + int create_index; 48 + unsigned long flags; 48 49 struct hlist_nulls_node nulls_node; 49 50 struct list_head all_list; 50 51 struct task_struct *task; 51 52 struct io_wq *wq; 52 53 53 54 struct io_wq_work *cur_work; 54 - struct io_wq_work *next_work; 55 55 raw_spinlock_t lock; 56 56 57 57 struct completion ref_done; 58 58 59 59 unsigned long create_state; 60 60 struct callback_head create_work; 61 - int create_index; 62 61 63 62 union { 64 63 struct rcu_head rcu; ··· 164 165 165 166 static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) 166 167 { 167 - return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND); 168 + return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags)); 168 169 } 169 170 170 171 static void io_worker_ref_put(struct io_wq *wq) ··· 224 225 wait_for_completion(&worker->ref_done); 225 226 226 227 raw_spin_lock(&wq->lock); 227 - if (worker->flags & IO_WORKER_F_FREE) 228 + if (test_bit(IO_WORKER_F_FREE, &worker->flags)) 228 229 hlist_nulls_del_rcu(&worker->nulls_node); 229 230 list_del_rcu(&worker->all_list); 230 231 raw_spin_unlock(&wq->lock); ··· 409 410 struct io_wq_acct *acct = io_wq_get_acct(worker); 410 411 struct io_wq *wq = worker->wq; 411 412 412 - if (!(worker->flags & IO_WORKER_F_UP)) 413 + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) 413 414 return; 414 415 415 416 if 
(!atomic_dec_and_test(&acct->nr_running)) ··· 429 430 */ 430 431 static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) 431 432 { 432 - if (worker->flags & IO_WORKER_F_FREE) { 433 - worker->flags &= ~IO_WORKER_F_FREE; 433 + if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { 434 + clear_bit(IO_WORKER_F_FREE, &worker->flags); 434 435 raw_spin_lock(&wq->lock); 435 436 hlist_nulls_del_init_rcu(&worker->nulls_node); 436 437 raw_spin_unlock(&wq->lock); ··· 443 444 static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) 444 445 __must_hold(wq->lock) 445 446 { 446 - if (!(worker->flags & IO_WORKER_F_FREE)) { 447 - worker->flags |= IO_WORKER_F_FREE; 447 + if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { 448 + set_bit(IO_WORKER_F_FREE, &worker->flags); 448 449 hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); 449 450 } 450 451 } ··· 538 539 539 540 raw_spin_lock(&worker->lock); 540 541 worker->cur_work = work; 541 - worker->next_work = NULL; 542 542 raw_spin_unlock(&worker->lock); 543 543 } 544 544 ··· 562 564 * clear the stalled flag. 563 565 */ 564 566 work = io_get_next_work(acct, worker); 565 - raw_spin_unlock(&acct->lock); 566 567 if (work) { 567 - __io_worker_busy(wq, worker); 568 - 569 568 /* 570 569 * Make sure cancelation can find this, even before 571 570 * it becomes the active work. That avoids a window ··· 571 576 * current work item for this worker. 
572 577 */ 573 578 raw_spin_lock(&worker->lock); 574 - worker->next_work = work; 579 + worker->cur_work = work; 575 580 raw_spin_unlock(&worker->lock); 576 - } else { 577 - break; 578 581 } 582 + 583 + raw_spin_unlock(&acct->lock); 584 + 585 + if (!work) 586 + break; 587 + 588 + __io_worker_busy(wq, worker); 589 + 579 590 io_assign_current_work(worker, work); 580 591 __set_current_state(TASK_RUNNING); 581 592 ··· 632 631 bool exit_mask = false, last_timeout = false; 633 632 char buf[TASK_COMM_LEN]; 634 633 635 - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); 634 + set_mask_bits(&worker->flags, 0, 635 + BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING)); 636 636 637 637 snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid); 638 638 set_task_comm(current, buf); ··· 697 695 698 696 if (!worker) 699 697 return; 700 - if (!(worker->flags & IO_WORKER_F_UP)) 698 + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) 701 699 return; 702 - if (worker->flags & IO_WORKER_F_RUNNING) 700 + if (test_bit(IO_WORKER_F_RUNNING, &worker->flags)) 703 701 return; 704 - worker->flags |= IO_WORKER_F_RUNNING; 702 + set_bit(IO_WORKER_F_RUNNING, &worker->flags); 705 703 io_wq_inc_running(worker); 706 704 } 707 705 ··· 715 713 716 714 if (!worker) 717 715 return; 718 - if (!(worker->flags & IO_WORKER_F_UP)) 716 + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) 719 717 return; 720 - if (!(worker->flags & IO_WORKER_F_RUNNING)) 718 + if (!test_bit(IO_WORKER_F_RUNNING, &worker->flags)) 721 719 return; 722 720 723 - worker->flags &= ~IO_WORKER_F_RUNNING; 721 + clear_bit(IO_WORKER_F_RUNNING, &worker->flags); 724 722 io_wq_dec_running(worker); 725 723 } 726 724 ··· 734 732 raw_spin_lock(&wq->lock); 735 733 hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); 736 734 list_add_tail_rcu(&worker->all_list, &wq->all_list); 737 - worker->flags |= IO_WORKER_F_FREE; 735 + set_bit(IO_WORKER_F_FREE, &worker->flags); 738 736 raw_spin_unlock(&wq->lock); 739 737 wake_up_new_task(tsk); 740 738 } ··· 
840 838 init_completion(&worker->ref_done); 841 839 842 840 if (index == IO_WQ_ACCT_BOUND) 843 - worker->flags |= IO_WORKER_F_BOUND; 841 + set_bit(IO_WORKER_F_BOUND, &worker->flags); 844 842 845 843 tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); 846 844 if (!IS_ERR(tsk)) { ··· 926 924 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) 927 925 { 928 926 struct io_wq_acct *acct = io_work_get_acct(wq, work); 927 + unsigned long work_flags = work->flags; 929 928 struct io_cb_cancel_data match; 930 - unsigned work_flags = work->flags; 931 929 bool do_create; 932 930 933 931 /* ··· 1007 1005 * may dereference the passed in work. 1008 1006 */ 1009 1007 raw_spin_lock(&worker->lock); 1010 - if (__io_wq_worker_cancel(worker, match, worker->cur_work) || 1011 - __io_wq_worker_cancel(worker, match, worker->next_work)) 1008 + if (__io_wq_worker_cancel(worker, match, worker->cur_work)) 1012 1009 match->nr_running++; 1013 1010 raw_spin_unlock(&worker->lock); 1014 1011

+122 -545

io_uring/io_uring.c

··· 63 63 #include <linux/sched/mm.h> 64 64 #include <linux/uaccess.h> 65 65 #include <linux/nospec.h> 66 - #include <linux/highmem.h> 67 66 #include <linux/fsnotify.h> 68 67 #include <linux/fadvise.h> 69 68 #include <linux/task_work.h> ··· 94 95 #include "waitid.h" 95 96 #include "futex.h" 96 97 #include "napi.h" 98 + #include "uring_cmd.h" 99 + #include "memmap.h" 97 100 98 101 #include "timeout.h" 99 102 #include "poll.h" ··· 171 170 .mode = 0644, 172 171 .proc_handler = proc_dointvec, 173 172 }, 174 - {}, 175 173 }; 176 174 #endif 177 - 178 - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) 179 - { 180 - if (!wq_list_empty(&ctx->submit_state.compl_reqs) || 181 - ctx->submit_state.cqes_count) 182 - __io_submit_flush_completions(ctx); 183 - } 184 175 185 176 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 186 177 { ··· 246 253 fallback_work.work); 247 254 struct llist_node *node = llist_del_all(&ctx->fallback_llist); 248 255 struct io_kiocb *req, *tmp; 249 - struct io_tw_state ts = { .locked = true, }; 256 + struct io_tw_state ts = {}; 250 257 251 258 percpu_ref_get(&ctx->refs); 252 259 mutex_lock(&ctx->uring_lock); 253 260 llist_for_each_entry_safe(req, tmp, node, io_task_work.node) 254 261 req->io_task_work.func(req, &ts); 255 - if (WARN_ON_ONCE(!ts.locked)) 256 - return; 257 262 io_submit_flush_completions(ctx); 258 263 mutex_unlock(&ctx->uring_lock); 259 264 percpu_ref_put(&ctx->refs); ··· 275 284 { 276 285 struct io_ring_ctx *ctx; 277 286 int hash_bits; 287 + bool ret; 278 288 279 289 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 280 290 if (!ctx) ··· 304 312 INIT_LIST_HEAD(&ctx->sqd_list); 305 313 INIT_LIST_HEAD(&ctx->cq_overflow_list); 306 314 INIT_LIST_HEAD(&ctx->io_buffers_cache); 307 - INIT_HLIST_HEAD(&ctx->io_buf_list); 308 - io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, 315 + ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, 309 316 sizeof(struct io_rsrc_node)); 310 
- io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, 317 + ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, 311 318 sizeof(struct async_poll)); 312 - io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, 319 + ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, 313 320 sizeof(struct io_async_msghdr)); 314 - io_futex_cache_init(ctx); 321 + ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, 322 + sizeof(struct io_async_rw)); 323 + ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, 324 + sizeof(struct uring_cache)); 325 + ret |= io_futex_cache_init(ctx); 326 + if (ret) 327 + goto err; 315 328 init_completion(&ctx->ref_comp); 316 329 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); 317 330 mutex_init(&ctx->uring_lock); ··· 334 337 init_llist_head(&ctx->work_llist); 335 338 INIT_LIST_HEAD(&ctx->tctx_list); 336 339 ctx->submit_state.free_list.next = NULL; 337 - INIT_WQ_LIST(&ctx->locked_free_list); 338 340 INIT_HLIST_HEAD(&ctx->waitid_list); 339 341 #ifdef CONFIG_FUTEX 340 342 INIT_HLIST_HEAD(&ctx->futex_list); ··· 345 349 346 350 return ctx; 347 351 err: 352 + io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); 353 + io_alloc_cache_free(&ctx->apoll_cache, kfree); 354 + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 355 + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 356 + io_alloc_cache_free(&ctx->uring_cache, kfree); 357 + io_futex_cache_free(ctx); 348 358 kfree(ctx->cancel_table.hbs); 349 359 kfree(ctx->cancel_table_locked.hbs); 350 360 xa_destroy(&ctx->io_bl_xa); ··· 381 379 { 382 380 if (req->flags & REQ_F_BUFFER_SELECTED) { 383 381 spin_lock(&req->ctx->completion_lock); 384 - io_put_kbuf_comp(req); 382 + io_kbuf_drop(req); 385 383 spin_unlock(&req->ctx->completion_lock); 386 384 } 387 385 ··· 500 498 } 501 499 } 502 500 503 - void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) 501 + static void io_queue_iowq(struct io_kiocb *req) 504 502 { 
505 503 struct io_kiocb *link = io_prep_linked_timeout(req); 506 504 struct io_uring_task *tctx = req->task->io_uring; ··· 668 666 io_commit_cqring_flush(ctx); 669 667 } 670 668 671 - static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) 672 - { 673 - struct io_overflow_cqe *ocqe; 674 - LIST_HEAD(list); 675 - 676 - spin_lock(&ctx->completion_lock); 677 - list_splice_init(&ctx->cq_overflow_list, &list); 678 - clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); 679 - spin_unlock(&ctx->completion_lock); 680 - 681 - while (!list_empty(&list)) { 682 - ocqe = list_first_entry(&list, struct io_overflow_cqe, list); 683 - list_del(&ocqe->list); 684 - kfree(ocqe); 685 - } 686 - } 687 - 688 - static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) 669 + static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) 689 670 { 690 671 size_t cqe_size = sizeof(struct io_uring_cqe); 691 672 692 - if (__io_cqring_events(ctx) == ctx->cq_entries) 673 + lockdep_assert_held(&ctx->uring_lock); 674 + 675 + /* don't abort if we're dying, entries must get freed */ 676 + if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) 693 677 return; 694 678 695 679 if (ctx->flags & IORING_SETUP_CQE32) ··· 686 698 struct io_uring_cqe *cqe; 687 699 struct io_overflow_cqe *ocqe; 688 700 689 - if (!io_get_cqe_overflow(ctx, &cqe, true)) 690 - break; 691 701 ocqe = list_first_entry(&ctx->cq_overflow_list, 692 702 struct io_overflow_cqe, list); 693 - memcpy(cqe, &ocqe->cqe, cqe_size); 703 + 704 + if (!dying) { 705 + if (!io_get_cqe_overflow(ctx, &cqe, true)) 706 + break; 707 + memcpy(cqe, &ocqe->cqe, cqe_size); 708 + } 694 709 list_del(&ocqe->list); 695 710 kfree(ocqe); 696 711 } ··· 705 714 io_cq_unlock_post(ctx); 706 715 } 707 716 708 - static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) 717 + static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) 709 718 { 710 - /* iopoll syncs against uring_lock, not completion_lock */ 711 - if (ctx->flags & 
IORING_SETUP_IOPOLL) 712 - mutex_lock(&ctx->uring_lock); 713 - __io_cqring_overflow_flush(ctx); 714 - if (ctx->flags & IORING_SETUP_IOPOLL) 715 - mutex_unlock(&ctx->uring_lock); 719 + if (ctx->rings) 720 + __io_cqring_overflow_flush(ctx, true); 716 721 } 717 722 718 - static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) 723 + static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) 719 724 { 720 - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) 721 - io_cqring_do_overflow_flush(ctx); 725 + mutex_lock(&ctx->uring_lock); 726 + __io_cqring_overflow_flush(ctx, false); 727 + mutex_unlock(&ctx->uring_lock); 722 728 } 723 729 724 730 /* can be called by any task */ ··· 805 817 return true; 806 818 } 807 819 808 - void io_req_cqe_overflow(struct io_kiocb *req) 820 + static void io_req_cqe_overflow(struct io_kiocb *req) 809 821 { 810 822 io_cqring_event_overflow(req->ctx, req->cqe.user_data, 811 823 req->cqe.res, req->cqe.flags, ··· 878 890 return false; 879 891 } 880 892 881 - static void __io_flush_post_cqes(struct io_ring_ctx *ctx) 882 - __must_hold(&ctx->uring_lock) 883 - { 884 - struct io_submit_state *state = &ctx->submit_state; 885 - unsigned int i; 886 - 887 - lockdep_assert_held(&ctx->uring_lock); 888 - for (i = 0; i < state->cqes_count; i++) { 889 - struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; 890 - 891 - if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { 892 - if (ctx->lockless_cq) { 893 - spin_lock(&ctx->completion_lock); 894 - io_cqring_event_overflow(ctx, cqe->user_data, 895 - cqe->res, cqe->flags, 0, 0); 896 - spin_unlock(&ctx->completion_lock); 897 - } else { 898 - io_cqring_event_overflow(ctx, cqe->user_data, 899 - cqe->res, cqe->flags, 0, 0); 900 - } 901 - } 902 - } 903 - state->cqes_count = 0; 904 - } 905 - 906 - static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, 907 - bool allow_overflow) 893 + bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 
cflags) 908 894 { 909 895 bool filled; 910 896 911 897 io_cq_lock(ctx); 912 898 filled = io_fill_cqe_aux(ctx, user_data, res, cflags); 913 - if (!filled && allow_overflow) 899 + if (!filled) 914 900 filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); 915 901 916 902 io_cq_unlock_post(ctx); 917 903 return filled; 918 904 } 919 905 920 - bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) 921 - { 922 - return __io_post_aux_cqe(ctx, user_data, res, cflags, true); 923 - } 924 - 925 906 /* 926 907 * A helper for multishot requests posting additional CQEs. 927 908 * Should only be used from a task_work including IO_URING_F_MULTISHOT. 928 909 */ 929 - bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) 910 + bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) 930 911 { 931 912 struct io_ring_ctx *ctx = req->ctx; 932 - u64 user_data = req->cqe.user_data; 933 - struct io_uring_cqe *cqe; 913 + bool posted; 934 914 935 915 lockdep_assert(!io_wq_current_is_worker()); 936 - 937 - if (!defer) 938 - return __io_post_aux_cqe(ctx, user_data, res, cflags, false); 939 - 940 916 lockdep_assert_held(&ctx->uring_lock); 941 917 942 - if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { 943 - __io_cq_lock(ctx); 944 - __io_flush_post_cqes(ctx); 945 - /* no need to flush - flush is deferred */ 946 - __io_cq_unlock_post(ctx); 947 - } 948 - 949 - /* For defered completions this is not as strict as it is otherwise, 950 - * however it's main job is to prevent unbounded posted completions, 951 - * and in that it works just as well. 
952 - */ 953 - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) 954 - return false; 955 - 956 - cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++]; 957 - cqe->user_data = user_data; 958 - cqe->res = res; 959 - cqe->flags = cflags; 960 - return true; 918 + __io_cq_lock(ctx); 919 + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); 920 + ctx->submit_state.cq_flush = true; 921 + __io_cq_unlock_post(ctx); 922 + return posted; 961 923 } 962 924 963 - static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) 925 + static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) 964 926 { 965 927 struct io_ring_ctx *ctx = req->ctx; 966 - struct io_rsrc_node *rsrc_node = NULL; 928 + 929 + /* 930 + * All execution paths but io-wq use the deferred completions by 931 + * passing IO_URING_F_COMPLETE_DEFER and thus should not end up here. 932 + */ 933 + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_IOWQ))) 934 + return; 935 + 936 + /* 937 + * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires 938 + * the submitter task context, IOPOLL protects with uring_lock. 939 + */ 940 + if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { 941 + req->io_task_work.func = io_req_task_complete; 942 + io_req_task_work_add(req); 943 + return; 944 + } 967 945 968 946 io_cq_lock(ctx); 969 947 if (!(req->flags & REQ_F_CQE_SKIP)) { 970 948 if (!io_fill_cqe_req(ctx, req)) 971 949 io_req_cqe_overflow(req); 972 950 } 973 - 974 - /* 975 - * If we're the last reference to this request, add to our locked 976 - * free_list cache. 
977 - */ 978 - if (req_ref_put_and_test(req)) { 979 - if (req->flags & IO_REQ_LINK_FLAGS) { 980 - if (req->flags & IO_DISARM_MASK) 981 - io_disarm_next(req); 982 - if (req->link) { 983 - io_req_task_queue(req->link); 984 - req->link = NULL; 985 - } 986 - } 987 - io_put_kbuf_comp(req); 988 - if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) 989 - io_clean_op(req); 990 - io_put_file(req); 991 - 992 - rsrc_node = req->rsrc_node; 993 - /* 994 - * Selected buffer deallocation in io_clean_op() assumes that 995 - * we don't hold ->completion_lock. Clean them here to avoid 996 - * deadlocks. 997 - */ 998 - io_put_task_remote(req->task); 999 - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); 1000 - ctx->locked_free_nr++; 1001 - } 1002 951 io_cq_unlock_post(ctx); 1003 952 1004 - if (rsrc_node) { 1005 - io_ring_submit_lock(ctx, issue_flags); 1006 - io_put_rsrc_node(ctx, rsrc_node); 1007 - io_ring_submit_unlock(ctx, issue_flags); 1008 - } 1009 - } 1010 - 1011 - void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) 1012 - { 1013 - struct io_ring_ctx *ctx = req->ctx; 1014 - 1015 - if (ctx->task_complete && ctx->submitter_task != current) { 1016 - req->io_task_work.func = io_req_task_complete; 1017 - io_req_task_work_add(req); 1018 - } else if (!(issue_flags & IO_URING_F_UNLOCKED) || 1019 - !(ctx->flags & IORING_SETUP_IOPOLL)) { 1020 - __io_req_complete_post(req, issue_flags); 1021 - } else { 1022 - mutex_lock(&ctx->uring_lock); 1023 - __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); 1024 - mutex_unlock(&ctx->uring_lock); 1025 - } 953 + /* 954 + * We don't free the request here because we know it's called from 955 + * io-wq only, which holds a reference, so it cannot be the last put. 
956 + */ 957 + req_ref_put(req); 1026 958 } 1027 959 1028 960 void io_req_defer_failed(struct io_kiocb *req, s32 res) ··· 973 1065 memset(&req->big_cqe, 0, sizeof(req->big_cqe)); 974 1066 } 975 1067 976 - static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, 977 - struct io_submit_state *state) 978 - { 979 - spin_lock(&ctx->completion_lock); 980 - wq_list_splice(&ctx->locked_free_list, &state->free_list); 981 - ctx->locked_free_nr = 0; 982 - spin_unlock(&ctx->completion_lock); 983 - } 984 - 985 1068 /* 986 1069 * A request might get retired back into the request caches even before opcode 987 1070 * handlers and io_issue_sqe() are done with it, e.g. inline completion path. ··· 984 1085 { 985 1086 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 986 1087 void *reqs[IO_REQ_ALLOC_BATCH]; 987 - int ret, i; 988 - 989 - /* 990 - * If we have more than a batch's worth of requests in our IRQ side 991 - * locked cache, grab the lock and move them over to our submission 992 - * side cache. 993 - */ 994 - if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { 995 - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); 996 - if (!io_req_cache_empty(ctx)) 997 - return true; 998 - } 1088 + int ret; 999 1089 1000 1090 ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); 1001 1091 ··· 1000 1112 } 1001 1113 1002 1114 percpu_ref_get_many(&ctx->refs, ret); 1003 - for (i = 0; i < ret; i++) { 1004 - struct io_kiocb *req = reqs[i]; 1115 + while (ret--) { 1116 + struct io_kiocb *req = reqs[ret]; 1005 1117 1006 1118 io_preinit_req(req, ctx); 1007 1119 io_req_add_to_cache(req, ctx); ··· 1051 1163 return; 1052 1164 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1053 1165 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1054 - if (ts->locked) { 1055 - io_submit_flush_completions(ctx); 1056 - mutex_unlock(&ctx->uring_lock); 1057 - ts->locked = false; 1058 - } 1166 + 1167 + io_submit_flush_completions(ctx); 1168 + mutex_unlock(&ctx->uring_lock); 1059 1169 
percpu_ref_put(&ctx->refs); 1060 1170 } 1061 1171 ··· 1077 1191 if (req->ctx != ctx) { 1078 1192 ctx_flush_and_put(ctx, &ts); 1079 1193 ctx = req->ctx; 1080 - /* if not contended, grab and improve batching */ 1081 - ts.locked = mutex_trylock(&ctx->uring_lock); 1194 + mutex_lock(&ctx->uring_lock); 1082 1195 percpu_ref_get(&ctx->refs); 1083 1196 } 1084 1197 INDIRECT_CALL_2(req->io_task_work.func, ··· 1338 1453 1339 1454 if (io_run_local_work_continue(ctx, ret, min_events)) 1340 1455 goto again; 1341 - if (ts->locked) { 1342 - io_submit_flush_completions(ctx); 1343 - if (io_run_local_work_continue(ctx, ret, min_events)) 1344 - goto again; 1345 - } 1456 + io_submit_flush_completions(ctx); 1457 + if (io_run_local_work_continue(ctx, ret, min_events)) 1458 + goto again; 1346 1459 1347 1460 trace_io_uring_local_work_run(ctx, ret, loops); 1348 1461 return ret; ··· 1349 1466 static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, 1350 1467 int min_events) 1351 1468 { 1352 - struct io_tw_state ts = { .locked = true, }; 1353 - int ret; 1469 + struct io_tw_state ts = {}; 1354 1470 1355 1471 if (llist_empty(&ctx->work_llist)) 1356 1472 return 0; 1357 - 1358 - ret = __io_run_local_work(ctx, &ts, min_events); 1359 - /* shouldn't happen! 
*/ 1360 - if (WARN_ON_ONCE(!ts.locked)) 1361 - mutex_lock(&ctx->uring_lock); 1362 - return ret; 1473 + return __io_run_local_work(ctx, &ts, min_events); 1363 1474 } 1364 1475 1365 1476 static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) ··· 1361 1484 struct io_tw_state ts = {}; 1362 1485 int ret; 1363 1486 1364 - ts.locked = mutex_trylock(&ctx->uring_lock); 1487 + mutex_lock(&ctx->uring_lock); 1365 1488 ret = __io_run_local_work(ctx, &ts, min_events); 1366 - if (ts.locked) 1367 - mutex_unlock(&ctx->uring_lock); 1368 - 1489 + mutex_unlock(&ctx->uring_lock); 1369 1490 return ret; 1370 1491 } 1371 1492 ··· 1380 1505 if (unlikely(req->task->flags & PF_EXITING)) 1381 1506 io_req_defer_failed(req, -EFAULT); 1382 1507 else if (req->flags & REQ_F_FORCE_ASYNC) 1383 - io_queue_iowq(req, ts); 1508 + io_queue_iowq(req); 1384 1509 else 1385 1510 io_queue_sqe(req); 1386 1511 } ··· 1425 1550 1426 1551 if (apoll->double_poll) 1427 1552 kfree(apoll->double_poll); 1428 - if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) 1553 + if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) 1429 1554 kfree(apoll); 1430 1555 req->flags &= ~REQ_F_POLLED; 1431 1556 } ··· 1435 1560 io_clean_op(req); 1436 1561 } 1437 1562 io_put_file(req); 1438 - 1439 - io_req_put_rsrc_locked(req, ctx); 1440 - 1563 + io_put_rsrc_node(ctx, req->rsrc_node); 1441 1564 io_put_task(req->task); 1565 + 1442 1566 node = req->comp_list.next; 1443 1567 io_req_add_to_cache(req, ctx); 1444 1568 } while (node); ··· 1450 1576 struct io_wq_work_node *node; 1451 1577 1452 1578 __io_cq_lock(ctx); 1453 - /* must come first to preserve CQE ordering in failure cases */ 1454 - if (state->cqes_count) 1455 - __io_flush_post_cqes(ctx); 1456 1579 __wq_list_for_each(node, &state->compl_reqs) { 1457 1580 struct io_kiocb *req = container_of(node, struct io_kiocb, 1458 1581 comp_list); ··· 1471 1600 io_free_batch_list(ctx, state->compl_reqs.first); 1472 1601 INIT_WQ_LIST(&state->compl_reqs); 1473 1602 } 1603 + 
ctx->submit_state.cq_flush = false; 1474 1604 } 1475 1605 1476 1606 static unsigned io_cqring_events(struct io_ring_ctx *ctx) ··· 1514 1642 unsigned int nr_events = 0; 1515 1643 unsigned long check_cq; 1516 1644 1645 + lockdep_assert_held(&ctx->uring_lock); 1646 + 1517 1647 if (!io_allowed_run_tw(ctx)) 1518 1648 return -EEXIST; 1519 1649 1520 1650 check_cq = READ_ONCE(ctx->check_cq); 1521 1651 if (unlikely(check_cq)) { 1522 1652 if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) 1523 - __io_cqring_overflow_flush(ctx); 1653 + __io_cqring_overflow_flush(ctx, false); 1524 1654 /* 1525 1655 * Similarly do not spin if we have not informed the user of any 1526 1656 * dropped CQE. ··· 1585 1711 1586 1712 void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) 1587 1713 { 1588 - if (ts->locked) 1589 - io_req_complete_defer(req); 1590 - else 1591 - io_req_complete_post(req, IO_URING_F_UNLOCKED); 1714 + io_req_complete_defer(req); 1592 1715 } 1593 1716 1594 1717 /* ··· 1656 1785 1657 1786 bool io_alloc_async_data(struct io_kiocb *req) 1658 1787 { 1659 - WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); 1660 - req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); 1788 + const struct io_issue_def *def = &io_issue_defs[req->opcode]; 1789 + 1790 + WARN_ON_ONCE(!def->async_size); 1791 + req->async_data = kmalloc(def->async_size, GFP_KERNEL); 1661 1792 if (req->async_data) { 1662 1793 req->flags |= REQ_F_ASYNC_DATA; 1663 1794 return false; 1664 1795 } 1665 1796 return true; 1666 - } 1667 - 1668 - int io_req_prep_async(struct io_kiocb *req) 1669 - { 1670 - const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; 1671 - const struct io_issue_def *def = &io_issue_defs[req->opcode]; 1672 - 1673 - /* assign early for deferred execution for non-fixed file */ 1674 - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file) 1675 - req->file = io_file_get_normal(req, req->cqe.fd); 1676 - if (!cdef->prep_async) 1677 - return 0; 1678 - if 
(WARN_ON_ONCE(req_has_async_data(req))) 1679 - return -EFAULT; 1680 - if (!def->manual_alloc) { 1681 - if (io_alloc_async_data(req)) 1682 - return -EAGAIN; 1683 - } 1684 - return cdef->prep_async(req); 1685 1797 } 1686 1798 1687 1799 static u32 io_get_sequence(struct io_kiocb *req) ··· 1947 2093 break; 1948 2094 case IO_APOLL_ABORTED: 1949 2095 io_kbuf_recycle(req, 0); 1950 - io_queue_iowq(req, NULL); 2096 + io_queue_iowq(req); 1951 2097 break; 1952 2098 case IO_APOLL_OK: 1953 2099 break; ··· 1984 2130 req->flags |= REQ_F_LINK; 1985 2131 io_req_defer_failed(req, req->cqe.res); 1986 2132 } else { 1987 - int ret = io_req_prep_async(req); 1988 - 1989 - if (unlikely(ret)) { 1990 - io_req_defer_failed(req, ret); 1991 - return; 1992 - } 1993 - 1994 2133 if (unlikely(req->ctx->drain_active)) 1995 2134 io_drain_req(req); 1996 2135 else 1997 - io_queue_iowq(req, NULL); 2136 + io_queue_iowq(req); 1998 2137 } 1999 2138 } 2000 2139 ··· 2193 2346 * conditions are true (normal request), then just queue it. 2194 2347 */ 2195 2348 if (unlikely(link->head)) { 2196 - ret = io_req_prep_async(req); 2197 - if (unlikely(ret)) 2198 - return io_submit_fail_init(sqe, req, ret); 2199 - 2200 2349 trace_io_uring_link(req, link->head); 2201 2350 link->last->link = req; 2202 2351 link->last = req; ··· 2440 2597 if (!llist_empty(&ctx->work_llist)) 2441 2598 io_run_local_work(ctx, min_events); 2442 2599 io_run_task_work(); 2443 - io_cqring_overflow_flush(ctx); 2444 - /* if user messes with these they will just get an early return */ 2600 + 2601 + if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) 2602 + io_cqring_do_overflow_flush(ctx); 2445 2603 if (__io_cqring_events_user(ctx) >= min_events) 2446 2604 return 0; 2447 2605 ··· 2542 2698 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; 2543 2699 } 2544 2700 2545 - void io_mem_free(void *ptr) 2546 - { 2547 - if (!ptr) 2548 - return; 2549 - 2550 - folio_put(virt_to_folio(ptr)); 2551 - } 2552 - 2553 - static void io_pages_free(struct page ***pages, int npages) 2554 - { 2555 - struct page **page_array = *pages; 2556 - int i; 2557 - 2558 - if (!page_array) 2559 - return; 2560 - 2561 - for (i = 0; i < npages; i++) 2562 - unpin_user_page(page_array[i]); 2563 - kvfree(page_array); 2564 - *pages = NULL; 2565 - } 2566 - 2567 - static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, 2568 - unsigned long uaddr, size_t size) 2569 - { 2570 - struct page **page_array; 2571 - unsigned int nr_pages; 2572 - void *page_addr; 2573 - int ret, i, pinned; 2574 - 2575 - *npages = 0; 2576 - 2577 - if (uaddr & (PAGE_SIZE - 1) || !size) 2578 - return ERR_PTR(-EINVAL); 2579 - 2580 - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2581 - if (nr_pages > USHRT_MAX) 2582 - return ERR_PTR(-EINVAL); 2583 - page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); 2584 - if (!page_array) 2585 - return ERR_PTR(-ENOMEM); 2586 - 2587 - 2588 - pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, 2589 - page_array); 2590 - if (pinned != nr_pages) { 2591 - ret = (pinned < 0) ? pinned : -EFAULT; 2592 - goto free_pages; 2593 - } 2594 - 2595 - page_addr = page_address(page_array[0]); 2596 - for (i = 0; i < nr_pages; i++) { 2597 - ret = -EINVAL; 2598 - 2599 - /* 2600 - * Can't support mapping user allocated ring memory on 32-bit 2601 - * archs where it could potentially reside in highmem. Just 2602 - * fail those with -EINVAL, just like we did on kernels that 2603 - * didn't support this feature. 2604 - */ 2605 - if (PageHighMem(page_array[i])) 2606 - goto free_pages; 2607 - 2608 - /* 2609 - * No support for discontig pages for now, should either be a 2610 - * single normal page, or a huge page. 
Later on we can add 2611 - * support for remapping discontig pages, for now we will 2612 - * just fail them with EINVAL. 2613 - */ 2614 - if (page_address(page_array[i]) != page_addr) 2615 - goto free_pages; 2616 - page_addr += PAGE_SIZE; 2617 - } 2618 - 2619 - *pages = page_array; 2620 - *npages = nr_pages; 2621 - return page_to_virt(page_array[0]); 2622 - 2623 - free_pages: 2624 - io_pages_free(&page_array, pinned > 0 ? pinned : 0); 2625 - return ERR_PTR(ret); 2626 - } 2627 - 2628 2701 static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, 2629 2702 size_t size) 2630 2703 { ··· 2559 2798 static void io_rings_free(struct io_ring_ctx *ctx) 2560 2799 { 2561 2800 if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { 2562 - io_mem_free(ctx->rings); 2563 - io_mem_free(ctx->sq_sqes); 2801 + io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, 2802 + true); 2803 + io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, 2804 + true); 2564 2805 } else { 2565 2806 io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); 2566 2807 ctx->n_ring_pages = 0; 2567 2808 io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); 2568 2809 ctx->n_sqe_pages = 0; 2810 + vunmap(ctx->rings); 2811 + vunmap(ctx->sq_sqes); 2569 2812 } 2570 2813 2571 2814 ctx->rings = NULL; 2572 2815 ctx->sq_sqes = NULL; 2573 - } 2574 - 2575 - void *io_mem_alloc(size_t size) 2576 - { 2577 - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; 2578 - void *ret; 2579 - 2580 - ret = (void *) __get_free_pages(gfp, get_order(size)); 2581 - if (ret) 2582 - return ret; 2583 - return ERR_PTR(-ENOMEM); 2584 2816 } 2585 2817 2586 2818 static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, ··· 2621 2867 int nr = 0; 2622 2868 2623 2869 mutex_lock(&ctx->uring_lock); 2624 - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); 2625 2870 2626 2871 while (!io_req_cache_empty(ctx)) { 2627 2872 req = io_extract_req(ctx); ··· 2630 2877 if (nr) 2631 2878 
percpu_ref_put_many(&ctx->refs, nr); 2632 2879 mutex_unlock(&ctx->uring_lock); 2633 - } 2634 - 2635 - static void io_rsrc_node_cache_free(struct io_cache_entry *entry) 2636 - { 2637 - kfree(container_of(entry, struct io_rsrc_node, cache)); 2638 2880 } 2639 2881 2640 2882 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) ··· 2646 2898 __io_sqe_files_unregister(ctx); 2647 2899 io_cqring_overflow_kill(ctx); 2648 2900 io_eventfd_unregister(ctx); 2649 - io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); 2901 + io_alloc_cache_free(&ctx->apoll_cache, kfree); 2650 2902 io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 2903 + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 2904 + io_alloc_cache_free(&ctx->uring_cache, kfree); 2651 2905 io_futex_cache_free(ctx); 2652 2906 io_destroy_buffers(ctx); 2653 2907 mutex_unlock(&ctx->uring_lock); ··· 2665 2915 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); 2666 2916 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); 2667 2917 2668 - io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); 2918 + io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); 2669 2919 if (ctx->mm_account) { 2670 2920 mmdrop(ctx->mm_account); 2671 2921 ctx->mm_account = NULL; 2672 2922 } 2673 2923 io_rings_free(ctx); 2674 - io_kbuf_mmap_list_free(ctx); 2675 2924 2676 2925 percpu_ref_exit(&ctx->refs); 2677 2926 free_uid(ctx->user); ··· 2894 3145 percpu_ref_kill(&ctx->refs); 2895 3146 xa_for_each(&ctx->personalities, index, creds) 2896 3147 io_unregister_personality(ctx, index); 2897 - if (ctx->rings) 2898 - io_poll_remove_all(ctx, NULL, true); 2899 3148 mutex_unlock(&ctx->uring_lock); 2900 - 2901 - /* 2902 - * If we failed setting up the ctx, we might not have any rings 2903 - * and therefore did not submit any requests 2904 - */ 2905 - if (ctx->rings) 2906 - io_kill_timeouts(ctx, NULL, true); 2907 3149 2908 3150 flush_delayed_work(&ctx->fallback_work); 2909 3151 ··· 2981 3241 return ret; 2982 3242 } 2983 3243 2984 
- static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, 2985 - struct task_struct *task, bool cancel_all) 2986 - { 2987 - struct hlist_node *tmp; 2988 - struct io_kiocb *req; 2989 - bool ret = false; 2990 - 2991 - lockdep_assert_held(&ctx->uring_lock); 2992 - 2993 - hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, 2994 - hash_node) { 2995 - struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, 2996 - struct io_uring_cmd); 2997 - struct file *file = req->file; 2998 - 2999 - if (!cancel_all && req->task != task) 3000 - continue; 3001 - 3002 - if (cmd->flags & IORING_URING_CMD_CANCELABLE) { 3003 - /* ->sqe isn't available if no async data */ 3004 - if (!req_has_async_data(req)) 3005 - cmd->sqe = NULL; 3006 - file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL); 3007 - ret = true; 3008 - } 3009 - } 3010 - io_submit_flush_completions(ctx); 3011 - 3012 - return ret; 3013 - } 3014 - 3015 3244 static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 3016 3245 struct task_struct *task, 3017 3246 bool cancel_all) ··· 3035 3326 ret |= io_kill_timeouts(ctx, task, cancel_all); 3036 3327 if (task) 3037 3328 ret |= io_run_task_work() > 0; 3329 + else 3330 + ret |= flush_delayed_work(&ctx->fallback_work); 3038 3331 return ret; 3039 3332 } 3040 3333 ··· 3135 3424 io_uring_cancel_generic(cancel_all, NULL); 3136 3425 } 3137 3426 3138 - static void *io_uring_validate_mmap_request(struct file *file, 3139 - loff_t pgoff, size_t sz) 3140 - { 3141 - struct io_ring_ctx *ctx = file->private_data; 3142 - loff_t offset = pgoff << PAGE_SHIFT; 3143 - struct page *page; 3144 - void *ptr; 3145 - 3146 - switch (offset & IORING_OFF_MMAP_MASK) { 3147 - case IORING_OFF_SQ_RING: 3148 - case IORING_OFF_CQ_RING: 3149 - /* Don't allow mmap if the ring was setup without it */ 3150 - if (ctx->flags & IORING_SETUP_NO_MMAP) 3151 - return ERR_PTR(-EINVAL); 3152 - ptr = ctx->rings; 3153 - break; 3154 - case IORING_OFF_SQES: 3155 - /* Don't allow mmap if the ring was setup without 
it */ 3156 - if (ctx->flags & IORING_SETUP_NO_MMAP) 3157 - return ERR_PTR(-EINVAL); 3158 - ptr = ctx->sq_sqes; 3159 - break; 3160 - case IORING_OFF_PBUF_RING: { 3161 - struct io_buffer_list *bl; 3162 - unsigned int bgid; 3163 - 3164 - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; 3165 - bl = io_pbuf_get_bl(ctx, bgid); 3166 - if (IS_ERR(bl)) 3167 - return bl; 3168 - ptr = bl->buf_ring; 3169 - io_put_bl(ctx, bl); 3170 - break; 3171 - } 3172 - default: 3173 - return ERR_PTR(-EINVAL); 3174 - } 3175 - 3176 - page = virt_to_head_page(ptr); 3177 - if (sz > page_size(page)) 3178 - return ERR_PTR(-EINVAL); 3179 - 3180 - return ptr; 3181 - } 3182 - 3183 - #ifdef CONFIG_MMU 3184 - 3185 - static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 3186 - { 3187 - size_t sz = vma->vm_end - vma->vm_start; 3188 - unsigned long pfn; 3189 - void *ptr; 3190 - 3191 - ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 3192 - if (IS_ERR(ptr)) 3193 - return PTR_ERR(ptr); 3194 - 3195 - pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 3196 - return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 3197 - } 3198 - 3199 - static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, 3200 - unsigned long addr, unsigned long len, 3201 - unsigned long pgoff, unsigned long flags) 3202 - { 3203 - void *ptr; 3204 - 3205 - /* 3206 - * Do not allow to map to user-provided address to avoid breaking the 3207 - * aliasing rules. Userspace is not able to guess the offset address of 3208 - * kernel kmalloc()ed memory area. 3209 - */ 3210 - if (addr) 3211 - return -EINVAL; 3212 - 3213 - ptr = io_uring_validate_mmap_request(filp, pgoff, len); 3214 - if (IS_ERR(ptr)) 3215 - return -ENOMEM; 3216 - 3217 - /* 3218 - * Some architectures have strong cache aliasing requirements. 3219 - * For such architectures we need a coherent mapping which aliases 3220 - * kernel memory *and* userspace memory. 
To achieve that: 3221 - * - use a NULL file pointer to reference physical memory, and 3222 - * - use the kernel virtual address of the shared io_uring context 3223 - * (instead of the userspace-provided address, which has to be 0UL 3224 - * anyway). 3225 - * - use the same pgoff which the get_unmapped_area() uses to 3226 - * calculate the page colouring. 3227 - * For architectures without such aliasing requirements, the 3228 - * architecture will return any suitable mapping because addr is 0. 3229 - */ 3230 - filp = NULL; 3231 - flags |= MAP_SHARED; 3232 - pgoff = 0; /* has been translated to ptr above */ 3233 - #ifdef SHM_COLOUR 3234 - addr = (uintptr_t) ptr; 3235 - pgoff = addr >> PAGE_SHIFT; 3236 - #else 3237 - addr = 0UL; 3238 - #endif 3239 - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); 3240 - } 3241 - 3242 - #else /* !CONFIG_MMU */ 3243 - 3244 - static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 3245 - { 3246 - return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -EINVAL; 3247 - } 3248 - 3249 - static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) 3250 - { 3251 - return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; 3252 - } 3253 - 3254 - static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, 3255 - unsigned long addr, unsigned long len, 3256 - unsigned long pgoff, unsigned long flags) 3257 - { 3258 - void *ptr; 3259 - 3260 - ptr = io_uring_validate_mmap_request(file, pgoff, len); 3261 - if (IS_ERR(ptr)) 3262 - return PTR_ERR(ptr); 3263 - 3264 - return (unsigned long) ptr; 3265 - } 3266 - 3267 - #endif /* !CONFIG_MMU */ 3268 - 3269 3427 static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) 3270 3428 { 3271 3429 if (flags & IORING_ENTER_EXT_ARG) { ··· 3227 3647 */ 3228 3648 ret = 0; 3229 3649 if (ctx->flags & IORING_SETUP_SQPOLL) { 3230 - io_cqring_overflow_flush(ctx); 3231 - 3232 3650 if (unlikely(ctx->sq_data->thread == NULL)) { 3233 3651 ret = -EOWNERDEAD; 3234 3652 goto out; ··· 3315 3737 static const struct file_operations io_uring_fops = { 3316 3738 .release = io_uring_release, 3317 3739 .mmap = io_uring_mmap, 3740 + .get_unmapped_area = io_uring_get_unmapped_area, 3318 3741 #ifndef CONFIG_MMU 3319 - .get_unmapped_area = io_uring_nommu_get_unmapped_area, 3320 3742 .mmap_capabilities = io_uring_nommu_mmap_capabilities, 3321 - #else 3322 - .get_unmapped_area = io_uring_mmu_get_unmapped_area, 3323 3743 #endif 3324 3744 .poll = io_uring_poll, 3325 3745 #ifdef CONFIG_PROC_FS ··· 3346 3770 return -EOVERFLOW; 3347 3771 3348 3772 if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 3349 - rings = io_mem_alloc(size); 3773 + rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); 3350 3774 else 3351 3775 rings = io_rings_map(ctx, p->cq_off.user_addr, size); 3352 3776 ··· 3371 3795 } 3372 3796 3373 3797 if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 3374 - ptr = io_mem_alloc(size); 3798 + ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); 3375 3799 
else 3376 3800 ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); 3377 3801 ··· 3570 3994 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | 3571 3995 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | 3572 3996 IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | 3573 - IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING; 3997 + IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | 3998 + IORING_FEAT_RECVSEND_BUNDLE; 3574 3999 3575 4000 if (copy_to_user(params, p, sizeof(*p))) { 3576 4001 ret = -EFAULT;

+12 -21

io_uring/io_uring.h

··· 62 62 } 63 63 64 64 bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); 65 - void io_req_cqe_overflow(struct io_kiocb *req); 66 65 int io_run_task_work_sig(struct io_ring_ctx *ctx); 67 66 void io_req_defer_failed(struct io_kiocb *req, s32 res); 68 - void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); 69 67 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); 70 - bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); 68 + bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); 71 69 void __io_commit_cqring_flush(struct io_ring_ctx *ctx); 72 - 73 - struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); 74 70 75 71 struct file *io_file_get_normal(struct io_kiocb *req, int fd); 76 72 struct file *io_file_get_fixed(struct io_kiocb *req, int fd, ··· 75 79 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); 76 80 bool io_alloc_async_data(struct io_kiocb *req); 77 81 void io_req_task_queue(struct io_kiocb *req); 78 - void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); 79 82 void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); 80 83 void io_req_task_queue_fail(struct io_kiocb *req, int ret); 81 84 void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); ··· 92 97 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); 93 98 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); 94 99 void __io_submit_flush_completions(struct io_ring_ctx *ctx); 95 - int io_req_prep_async(struct io_kiocb *req); 96 100 97 101 struct io_wq_work *io_wq_free_work(struct io_wq_work *work); 98 102 void io_wq_submit_work(struct io_wq_work *work); ··· 104 110 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, 105 111 bool cancel_all); 106 112 107 - void *io_mem_alloc(size_t size); 108 - void io_mem_free(void *ptr); 109 - 110 113 enum { 111 114 IO_EVENTFD_OP_SIGNAL_BIT, 
112 115 IO_EVENTFD_OP_FREE_BIT, ··· 112 121 void io_eventfd_ops(struct rcu_head *rcu); 113 122 void io_activate_pollwq(struct io_ring_ctx *ctx); 114 123 115 - #if defined(CONFIG_PROVE_LOCKING) 116 124 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) 117 125 { 126 + #if defined(CONFIG_PROVE_LOCKING) 118 127 lockdep_assert(in_task()); 119 128 120 129 if (ctx->flags & IORING_SETUP_IOPOLL) { ··· 133 142 else 134 143 lockdep_assert(current == ctx->submitter_task); 135 144 } 136 - } 137 - #else 138 - static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) 139 - { 140 - } 141 145 #endif 146 + } 142 147 143 148 static inline void io_req_task_work_add(struct io_kiocb *req) 144 149 { 145 150 __io_req_task_work_add(req, 0); 151 + } 152 + 153 + static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) 154 + { 155 + if (!wq_list_empty(&ctx->submit_state.compl_reqs) || 156 + ctx->submit_state.cq_flush) 157 + __io_submit_flush_completions(ctx); 146 158 } 147 159 148 160 #define io_for_each_link(pos, head) \ ··· 334 340 335 341 static inline bool io_task_work_pending(struct io_ring_ctx *ctx) 336 342 { 337 - return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); 343 + return task_work_pending(current) || !llist_empty(&ctx->work_llist); 338 344 } 339 345 340 346 static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) 341 347 { 342 - if (!ts->locked) { 343 - mutex_lock(&ctx->uring_lock); 344 - ts->locked = true; 345 - } 348 + lockdep_assert_held(&ctx->uring_lock); 346 349 } 347 350 348 351 /*

+189 -131

io_uring/kbuf.c

··· 7 7 #include <linux/slab.h> 8 8 #include <linux/namei.h> 9 9 #include <linux/poll.h> 10 + #include <linux/vmalloc.h> 10 11 #include <linux/io_uring.h> 11 12 12 13 #include <uapi/linux/io_uring.h> ··· 15 14 #include "io_uring.h" 16 15 #include "opdef.h" 17 16 #include "kbuf.h" 18 - 19 - #define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) 17 + #include "memmap.h" 20 18 21 19 /* BIDs are addressed by a 16-bit field in a CQE */ 22 20 #define MAX_BIDS_PER_BGID (1 << 16) ··· 31 31 __u16 bid; 32 32 }; 33 33 34 - struct io_buf_free { 35 - struct hlist_node list; 36 - void *mem; 37 - size_t size; 38 - int inuse; 39 - }; 40 - 41 - static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, 42 - unsigned int bgid) 43 - { 44 - return xa_load(&ctx->io_bl_xa, bgid); 45 - } 46 - 47 34 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, 48 35 unsigned int bgid) 49 36 { 50 37 lockdep_assert_held(&ctx->uring_lock); 51 38 52 - return __io_buffer_get_list(ctx, bgid); 39 + return xa_load(&ctx->io_bl_xa, bgid); 53 40 } 54 41 55 42 static int io_buffer_add_list(struct io_ring_ctx *ctx, ··· 117 130 return NULL; 118 131 } 119 132 133 + static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, 134 + struct io_buffer_list *bl, 135 + struct iovec *iov) 136 + { 137 + void __user *buf; 138 + 139 + buf = io_provided_buffer_select(req, len, bl); 140 + if (unlikely(!buf)) 141 + return -ENOBUFS; 142 + 143 + iov[0].iov_base = buf; 144 + iov[0].iov_len = *len; 145 + return 0; 146 + } 147 + 148 + static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br, 149 + __u16 head, __u16 mask) 150 + { 151 + return &br->bufs[head & mask]; 152 + } 153 + 120 154 static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, 121 155 struct io_buffer_list *bl, 122 156 unsigned int issue_flags) ··· 153 145 if (head + 1 == tail) 154 146 req->flags |= REQ_F_BL_EMPTY; 155 147 156 - head &= 
bl->mask; 157 - /* mmaped buffers are always contig */ 158 - if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { 159 - buf = &br->bufs[head]; 160 - } else { 161 - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); 162 - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; 163 - buf = page_address(bl->buf_pages[index]); 164 - buf += off; 165 - } 148 + buf = io_ring_head_to_buf(br, head, bl->mask); 166 149 if (*len == 0 || *len > buf->len) 167 150 *len = buf->len; 168 - req->flags |= REQ_F_BUFFER_RING; 151 + req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; 169 152 req->buf_list = bl; 170 153 req->buf_index = buf->bid; 171 154 ··· 171 172 * the transfer completes (or if we get -EAGAIN and must poll of 172 173 * retry). 173 174 */ 175 + req->flags &= ~REQ_F_BUFFERS_COMMIT; 174 176 req->buf_list = NULL; 175 177 bl->head++; 176 178 } ··· 198 198 return ret; 199 199 } 200 200 201 - /* 202 - * Mark the given mapped range as free for reuse 203 - */ 204 - static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl) 205 - { 206 - struct io_buf_free *ibf; 201 + /* cap it at a reasonable 256, will be one page even for 4K */ 202 + #define PEEK_MAX_IMPORT 256 207 203 208 - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { 209 - if (bl->buf_ring == ibf->mem) { 210 - ibf->inuse = 0; 211 - return; 212 - } 204 + static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, 205 + struct io_buffer_list *bl) 206 + { 207 + struct io_uring_buf_ring *br = bl->buf_ring; 208 + struct iovec *iov = arg->iovs; 209 + int nr_iovs = arg->nr_iovs; 210 + __u16 nr_avail, tail, head; 211 + struct io_uring_buf *buf; 212 + 213 + tail = smp_load_acquire(&br->tail); 214 + head = bl->head; 215 + nr_avail = min_t(__u16, tail - head, UIO_MAXIOV); 216 + if (unlikely(!nr_avail)) 217 + return -ENOBUFS; 218 + 219 + buf = io_ring_head_to_buf(br, head, bl->mask); 220 + if (arg->max_len) { 221 + int needed; 222 + 223 + needed = (arg->max_len + buf->len - 1) / 
buf->len; 224 + needed = min(needed, PEEK_MAX_IMPORT); 225 + if (nr_avail > needed) 226 + nr_avail = needed; 213 227 } 214 228 215 - /* can't happen... */ 216 - WARN_ON_ONCE(1); 229 + /* 230 + * only alloc a bigger array if we know we have data to map, eg not 231 + * a speculative peek operation. 232 + */ 233 + if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) { 234 + iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL); 235 + if (unlikely(!iov)) 236 + return -ENOMEM; 237 + if (arg->mode & KBUF_MODE_FREE) 238 + kfree(arg->iovs); 239 + arg->iovs = iov; 240 + nr_iovs = nr_avail; 241 + } else if (nr_avail < nr_iovs) { 242 + nr_iovs = nr_avail; 243 + } 244 + 245 + /* set it to max, if not set, so we can use it unconditionally */ 246 + if (!arg->max_len) 247 + arg->max_len = INT_MAX; 248 + 249 + req->buf_index = buf->bid; 250 + do { 251 + /* truncate end piece, if needed */ 252 + if (buf->len > arg->max_len) 253 + buf->len = arg->max_len; 254 + 255 + iov->iov_base = u64_to_user_ptr(buf->addr); 256 + iov->iov_len = buf->len; 257 + iov++; 258 + 259 + arg->out_len += buf->len; 260 + arg->max_len -= buf->len; 261 + if (!arg->max_len) 262 + break; 263 + 264 + buf = io_ring_head_to_buf(br, ++head, bl->mask); 265 + } while (--nr_iovs); 266 + 267 + if (head == tail) 268 + req->flags |= REQ_F_BL_EMPTY; 269 + 270 + req->flags |= REQ_F_BUFFER_RING; 271 + req->buf_list = bl; 272 + return iov - arg->iovs; 273 + } 274 + 275 + int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, 276 + unsigned int issue_flags) 277 + { 278 + struct io_ring_ctx *ctx = req->ctx; 279 + struct io_buffer_list *bl; 280 + int ret = -ENOENT; 281 + 282 + io_ring_submit_lock(ctx, issue_flags); 283 + bl = io_buffer_get_list(ctx, req->buf_index); 284 + if (unlikely(!bl)) 285 + goto out_unlock; 286 + 287 + if (bl->is_buf_ring) { 288 + ret = io_ring_buffers_peek(req, arg, bl); 289 + /* 290 + * Don't recycle these buffers if we need to go through poll. 
291 + * Nobody else can use them anyway, and holding on to provided 292 + * buffers for a send/write operation would happen on the app 293 + * side anyway with normal buffers. Besides, we already 294 + * committed them, they cannot be put back in the queue. 295 + */ 296 + if (ret > 0) { 297 + req->flags |= REQ_F_BL_NO_RECYCLE; 298 + req->buf_list->head += ret; 299 + } 300 + } else { 301 + ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); 302 + } 303 + out_unlock: 304 + io_ring_submit_unlock(ctx, issue_flags); 305 + return ret; 306 + } 307 + 308 + int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) 309 + { 310 + struct io_ring_ctx *ctx = req->ctx; 311 + struct io_buffer_list *bl; 312 + int ret; 313 + 314 + lockdep_assert_held(&ctx->uring_lock); 315 + 316 + bl = io_buffer_get_list(ctx, req->buf_index); 317 + if (unlikely(!bl)) 318 + return -ENOENT; 319 + 320 + if (bl->is_buf_ring) { 321 + ret = io_ring_buffers_peek(req, arg, bl); 322 + if (ret > 0) 323 + req->flags |= REQ_F_BUFFERS_COMMIT; 324 + return ret; 325 + } 326 + 327 + /* don't support multiple buffer selections for legacy */ 328 + return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); 217 329 } 218 330 219 331 static int __io_remove_buffers(struct io_ring_ctx *ctx, ··· 339 227 340 228 if (bl->is_buf_ring) { 341 229 i = bl->buf_ring->tail - bl->head; 342 - if (bl->is_mmap) { 343 - /* 344 - * io_kbuf_list_free() will free the page(s) at 345 - * ->release() time. 
346 - */ 347 - io_kbuf_mark_free(ctx, bl); 348 - bl->buf_ring = NULL; 349 - bl->is_mmap = 0; 350 - } else if (bl->buf_nr_pages) { 230 + if (bl->buf_nr_pages) { 351 231 int j; 352 232 353 - for (j = 0; j < bl->buf_nr_pages; j++) 354 - unpin_user_page(bl->buf_pages[j]); 355 - kvfree(bl->buf_pages); 356 - bl->buf_pages = NULL; 357 - bl->buf_nr_pages = 0; 233 + if (!bl->is_mmap) { 234 + for (j = 0; j < bl->buf_nr_pages; j++) 235 + unpin_user_page(bl->buf_pages[j]); 236 + } 237 + io_pages_unmap(bl->buf_ring, &bl->buf_pages, 238 + &bl->buf_nr_pages, bl->is_mmap); 239 + bl->is_mmap = 0; 358 240 } 359 241 /* make sure it's seen as empty */ 360 242 INIT_LIST_HEAD(&bl->buf_list); ··· 604 498 static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, 605 499 struct io_buffer_list *bl) 606 500 { 607 - struct io_uring_buf_ring *br; 501 + struct io_uring_buf_ring *br = NULL; 608 502 struct page **pages; 609 - int i, nr_pages; 503 + int nr_pages, ret; 610 504 611 505 pages = io_pin_pages(reg->ring_addr, 612 506 flex_array_size(br, bufs, reg->ring_entries), ··· 614 508 if (IS_ERR(pages)) 615 509 return PTR_ERR(pages); 616 510 617 - /* 618 - * Apparently some 32-bit boxes (ARM) will return highmem pages, 619 - * which then need to be mapped. We could support that, but it'd 620 - * complicate the code and slowdown the common cases quite a bit. 621 - * So just error out, returning -EINVAL just like we did on kernels 622 - * that didn't support mapped buffer rings. 623 - */ 624 - for (i = 0; i < nr_pages; i++) 625 - if (PageHighMem(pages[i])) 626 - goto error_unpin; 511 + br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); 512 + if (!br) { 513 + ret = -ENOMEM; 514 + goto error_unpin; 515 + } 627 516 628 - br = page_address(pages[0]); 629 517 #ifdef SHM_COLOUR 630 518 /* 631 519 * On platforms that have specific aliasing requirements, SHM_COLOUR ··· 630 530 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle 631 531 * this transparently. 
632 532 */ 633 - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) 533 + if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { 534 + ret = -EINVAL; 634 535 goto error_unpin; 536 + } 635 537 #endif 636 538 bl->buf_pages = pages; 637 539 bl->buf_nr_pages = nr_pages; ··· 642 540 bl->is_mmap = 0; 643 541 return 0; 644 542 error_unpin: 645 - for (i = 0; i < nr_pages; i++) 646 - unpin_user_page(pages[i]); 543 + unpin_user_pages(pages, nr_pages); 647 544 kvfree(pages); 648 - return -EINVAL; 649 - } 650 - 651 - /* 652 - * See if we have a suitable region that we can reuse, rather than allocate 653 - * both a new io_buf_free and mem region again. We leave it on the list as 654 - * even a reused entry will need freeing at ring release. 655 - */ 656 - static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx, 657 - size_t ring_size) 658 - { 659 - struct io_buf_free *ibf, *best = NULL; 660 - size_t best_dist; 661 - 662 - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { 663 - size_t dist; 664 - 665 - if (ibf->inuse || ibf->size < ring_size) 666 - continue; 667 - dist = ibf->size - ring_size; 668 - if (!best || dist < best_dist) { 669 - best = ibf; 670 - if (!dist) 671 - break; 672 - best_dist = dist; 673 - } 674 - } 675 - 676 - return best; 545 + vunmap(br); 546 + return ret; 677 547 } 678 548 679 549 static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, 680 550 struct io_uring_buf_reg *reg, 681 551 struct io_buffer_list *bl) 682 552 { 683 - struct io_buf_free *ibf; 684 553 size_t ring_size; 685 - void *ptr; 686 554 687 555 ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); 688 556 689 - /* Reuse existing entry, if we can */ 690 - ibf = io_lookup_buf_free_entry(ctx, ring_size); 691 - if (!ibf) { 692 - ptr = io_mem_alloc(ring_size); 693 - if (IS_ERR(ptr)) 694 - return PTR_ERR(ptr); 557 + bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); 558 + if (!bl->buf_ring) 559 + return -ENOMEM; 695 560 696 
- /* Allocate and store deferred free entry */ 697 - ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT); 698 - if (!ibf) { 699 - io_mem_free(ptr); 700 - return -ENOMEM; 701 - } 702 - ibf->mem = ptr; 703 - ibf->size = ring_size; 704 - hlist_add_head(&ibf->list, &ctx->io_buf_list); 705 - } 706 - ibf->inuse = 1; 707 - bl->buf_ring = ibf->mem; 708 561 bl->is_buf_ring = 1; 709 562 bl->is_mmap = 1; 710 563 return 0; ··· 807 750 return ERR_PTR(-EINVAL); 808 751 } 809 752 810 - /* 811 - * Called at or after ->release(), free the mmap'ed buffers that we used 812 - * for memory mapped provided buffer rings. 813 - */ 814 - void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx) 753 + int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) 815 754 { 816 - struct io_buf_free *ibf; 817 - struct hlist_node *tmp; 755 + struct io_ring_ctx *ctx = file->private_data; 756 + loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; 757 + struct io_buffer_list *bl; 758 + int bgid, ret; 818 759 819 - hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) { 820 - hlist_del(&ibf->list); 821 - io_mem_free(ibf->mem); 822 - kfree(ibf); 823 - } 760 + bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; 761 + bl = io_pbuf_get_bl(ctx, bgid); 762 + if (IS_ERR(bl)) 763 + return PTR_ERR(bl); 764 + 765 + ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); 766 + io_put_bl(ctx, bl); 767 + return ret; 824 768 }

+47 -17

io_uring/kbuf.h

··· 41 41 __u16 bgid; 42 42 }; 43 43 44 + enum { 45 + /* can alloc a bigger vec */ 46 + KBUF_MODE_EXPAND = 1, 47 + /* if bigger vec allocated, free old one */ 48 + KBUF_MODE_FREE = 2, 49 + }; 50 + 51 + struct buf_sel_arg { 52 + struct iovec *iovs; 53 + size_t out_len; 54 + size_t max_len; 55 + int nr_iovs; 56 + int mode; 57 + }; 58 + 44 59 void __user *io_buffer_select(struct io_kiocb *req, size_t *len, 45 60 unsigned int issue_flags); 61 + int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, 62 + unsigned int issue_flags); 63 + int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); 46 64 void io_destroy_buffers(struct io_ring_ctx *ctx); 47 65 48 66 int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); ··· 73 55 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); 74 56 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); 75 57 76 - void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); 77 - 78 58 void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); 79 59 80 60 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); ··· 80 64 void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); 81 65 struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, 82 66 unsigned long bgid); 67 + int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); 83 68 84 69 static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) 85 70 { ··· 93 76 */ 94 77 if (req->buf_list) { 95 78 req->buf_index = req->buf_list->bgid; 96 - req->flags &= ~REQ_F_BUFFER_RING; 79 + req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); 97 80 return true; 98 81 } 99 82 return false; ··· 117 100 return false; 118 101 } 119 102 120 - static inline void __io_put_kbuf_ring(struct io_kiocb *req) 103 + static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr) 121 104 { 122 - if (req->buf_list) { 123 - req->buf_index = req->buf_list->bgid; 124 - 
req->buf_list->head++; 105 + struct io_buffer_list *bl = req->buf_list; 106 + 107 + if (bl) { 108 + if (req->flags & REQ_F_BUFFERS_COMMIT) { 109 + bl->head += nr; 110 + req->flags &= ~REQ_F_BUFFERS_COMMIT; 111 + } 112 + req->buf_index = bl->bgid; 125 113 } 126 114 req->flags &= ~REQ_F_BUFFER_RING; 127 115 } ··· 135 113 struct list_head *list) 136 114 { 137 115 if (req->flags & REQ_F_BUFFER_RING) { 138 - __io_put_kbuf_ring(req); 116 + __io_put_kbuf_ring(req, 1); 139 117 } else { 140 118 req->buf_index = req->kbuf->bgid; 141 119 list_add(&req->kbuf->list, list); ··· 143 121 } 144 122 } 145 123 146 - static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) 124 + static inline void io_kbuf_drop(struct io_kiocb *req) 147 125 { 148 - unsigned int ret; 149 - 150 126 lockdep_assert_held(&req->ctx->completion_lock); 151 127 152 128 if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) 153 - return 0; 129 + return; 154 130 155 - ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); 156 131 __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); 157 - return ret; 158 132 } 159 133 160 - static inline unsigned int io_put_kbuf(struct io_kiocb *req, 161 - unsigned issue_flags) 134 + static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, 135 + unsigned issue_flags) 162 136 { 163 137 unsigned int ret; 164 138 ··· 163 145 164 146 ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); 165 147 if (req->flags & REQ_F_BUFFER_RING) 166 - __io_put_kbuf_ring(req); 148 + __io_put_kbuf_ring(req, nbufs); 167 149 else 168 150 __io_put_kbuf(req, issue_flags); 169 151 return ret; 152 + } 153 + 154 + static inline unsigned int io_put_kbuf(struct io_kiocb *req, 155 + unsigned issue_flags) 156 + { 157 + return __io_put_kbufs(req, 1, issue_flags); 158 + } 159 + 160 + static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs, 161 + unsigned issue_flags) 162 + { 163 + return __io_put_kbufs(req, nbufs, issue_flags); 
170 164 } 171 165 #endif

+336

io_uring/memmap.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <linux/kernel.h> 3 + #include <linux/init.h> 4 + #include <linux/errno.h> 5 + #include <linux/mm.h> 6 + #include <linux/mman.h> 7 + #include <linux/slab.h> 8 + #include <linux/vmalloc.h> 9 + #include <linux/io_uring.h> 10 + #include <linux/io_uring_types.h> 11 + #include <asm/shmparam.h> 12 + 13 + #include "memmap.h" 14 + #include "kbuf.h" 15 + 16 + static void *io_mem_alloc_compound(struct page **pages, int nr_pages, 17 + size_t size, gfp_t gfp) 18 + { 19 + struct page *page; 20 + int i, order; 21 + 22 + order = get_order(size); 23 + if (order > MAX_PAGE_ORDER) 24 + return ERR_PTR(-ENOMEM); 25 + else if (order) 26 + gfp |= __GFP_COMP; 27 + 28 + page = alloc_pages(gfp, order); 29 + if (!page) 30 + return ERR_PTR(-ENOMEM); 31 + 32 + for (i = 0; i < nr_pages; i++) 33 + pages[i] = page + i; 34 + 35 + return page_address(page); 36 + } 37 + 38 + static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, 39 + gfp_t gfp) 40 + { 41 + void *ret; 42 + int i; 43 + 44 + for (i = 0; i < nr_pages; i++) { 45 + pages[i] = alloc_page(gfp); 46 + if (!pages[i]) 47 + goto err; 48 + } 49 + 50 + ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); 51 + if (ret) 52 + return ret; 53 + err: 54 + while (i--) 55 + put_page(pages[i]); 56 + return ERR_PTR(-ENOMEM); 57 + } 58 + 59 + void *io_pages_map(struct page ***out_pages, unsigned short *npages, 60 + size_t size) 61 + { 62 + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; 63 + struct page **pages; 64 + int nr_pages; 65 + void *ret; 66 + 67 + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 68 + pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); 69 + if (!pages) 70 + return ERR_PTR(-ENOMEM); 71 + 72 + ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); 73 + if (!IS_ERR(ret)) 74 + goto done; 75 + 76 + ret = io_mem_alloc_single(pages, nr_pages, size, gfp); 77 + if (!IS_ERR(ret)) { 78 + done: 79 + *out_pages = pages; 80 + *npages = nr_pages; 
81 + return ret; 82 + } 83 + 84 + kvfree(pages); 85 + *out_pages = NULL; 86 + *npages = 0; 87 + return ret; 88 + } 89 + 90 + void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, 91 + bool put_pages) 92 + { 93 + bool do_vunmap = false; 94 + 95 + if (!ptr) 96 + return; 97 + 98 + if (put_pages && *npages) { 99 + struct page **to_free = *pages; 100 + int i; 101 + 102 + /* 103 + * Only did vmap for the non-compound multiple page case. 104 + * For the compound page, we just need to put the head. 105 + */ 106 + if (PageCompound(to_free[0])) 107 + *npages = 1; 108 + else if (*npages > 1) 109 + do_vunmap = true; 110 + for (i = 0; i < *npages; i++) 111 + put_page(to_free[i]); 112 + } 113 + if (do_vunmap) 114 + vunmap(ptr); 115 + kvfree(*pages); 116 + *pages = NULL; 117 + *npages = 0; 118 + } 119 + 120 + void io_pages_free(struct page ***pages, int npages) 121 + { 122 + struct page **page_array = *pages; 123 + 124 + if (!page_array) 125 + return; 126 + 127 + unpin_user_pages(page_array, npages); 128 + kvfree(page_array); 129 + *pages = NULL; 130 + } 131 + 132 + struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) 133 + { 134 + unsigned long start, end, nr_pages; 135 + struct page **pages; 136 + int ret; 137 + 138 + end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 139 + start = uaddr >> PAGE_SHIFT; 140 + nr_pages = end - start; 141 + if (WARN_ON_ONCE(!nr_pages)) 142 + return ERR_PTR(-EINVAL); 143 + 144 + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); 145 + if (!pages) 146 + return ERR_PTR(-ENOMEM); 147 + 148 + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, 149 + pages); 150 + /* success, mapped all pages */ 151 + if (ret == nr_pages) { 152 + *npages = nr_pages; 153 + return pages; 154 + } 155 + 156 + /* partial map, or didn't map anything */ 157 + if (ret >= 0) { 158 + /* if we did partial map, release any pages we did get */ 159 + if (ret) 160 + unpin_user_pages(pages, ret); 161 
+ ret = -EFAULT; 162 + } 163 + kvfree(pages); 164 + return ERR_PTR(ret); 165 + } 166 + 167 + void *__io_uaddr_map(struct page ***pages, unsigned short *npages, 168 + unsigned long uaddr, size_t size) 169 + { 170 + struct page **page_array; 171 + unsigned int nr_pages; 172 + void *page_addr; 173 + 174 + *npages = 0; 175 + 176 + if (uaddr & (PAGE_SIZE - 1) || !size) 177 + return ERR_PTR(-EINVAL); 178 + 179 + nr_pages = 0; 180 + page_array = io_pin_pages(uaddr, size, &nr_pages); 181 + if (IS_ERR(page_array)) 182 + return page_array; 183 + 184 + page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); 185 + if (page_addr) { 186 + *pages = page_array; 187 + *npages = nr_pages; 188 + return page_addr; 189 + } 190 + 191 + io_pages_free(&page_array, nr_pages); 192 + return ERR_PTR(-ENOMEM); 193 + } 194 + 195 + static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, 196 + size_t sz) 197 + { 198 + struct io_ring_ctx *ctx = file->private_data; 199 + loff_t offset = pgoff << PAGE_SHIFT; 200 + 201 + switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { 202 + case IORING_OFF_SQ_RING: 203 + case IORING_OFF_CQ_RING: 204 + /* Don't allow mmap if the ring was setup without it */ 205 + if (ctx->flags & IORING_SETUP_NO_MMAP) 206 + return ERR_PTR(-EINVAL); 207 + return ctx->rings; 208 + case IORING_OFF_SQES: 209 + /* Don't allow mmap if the ring was setup without it */ 210 + if (ctx->flags & IORING_SETUP_NO_MMAP) 211 + return ERR_PTR(-EINVAL); 212 + return ctx->sq_sqes; 213 + case IORING_OFF_PBUF_RING: { 214 + struct io_buffer_list *bl; 215 + unsigned int bgid; 216 + void *ptr; 217 + 218 + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; 219 + bl = io_pbuf_get_bl(ctx, bgid); 220 + if (IS_ERR(bl)) 221 + return bl; 222 + ptr = bl->buf_ring; 223 + io_put_bl(ctx, bl); 224 + return ptr; 225 + } 226 + } 227 + 228 + return ERR_PTR(-EINVAL); 229 + } 230 + 231 + int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, 232 + struct 
page **pages, int npages) 233 + { 234 + unsigned long nr_pages = npages; 235 + 236 + vm_flags_set(vma, VM_DONTEXPAND); 237 + return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); 238 + } 239 + 240 + #ifdef CONFIG_MMU 241 + 242 + __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 243 + { 244 + struct io_ring_ctx *ctx = file->private_data; 245 + size_t sz = vma->vm_end - vma->vm_start; 246 + long offset = vma->vm_pgoff << PAGE_SHIFT; 247 + void *ptr; 248 + 249 + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 250 + if (IS_ERR(ptr)) 251 + return PTR_ERR(ptr); 252 + 253 + switch (offset & IORING_OFF_MMAP_MASK) { 254 + case IORING_OFF_SQ_RING: 255 + case IORING_OFF_CQ_RING: 256 + return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, 257 + ctx->n_ring_pages); 258 + case IORING_OFF_SQES: 259 + return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, 260 + ctx->n_sqe_pages); 261 + case IORING_OFF_PBUF_RING: 262 + return io_pbuf_mmap(file, vma); 263 + } 264 + 265 + return -EINVAL; 266 + } 267 + 268 + unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, 269 + unsigned long len, unsigned long pgoff, 270 + unsigned long flags) 271 + { 272 + void *ptr; 273 + 274 + /* 275 + * Do not allow to map to user-provided address to avoid breaking the 276 + * aliasing rules. Userspace is not able to guess the offset address of 277 + * kernel kmalloc()ed memory area. 278 + */ 279 + if (addr) 280 + return -EINVAL; 281 + 282 + ptr = io_uring_validate_mmap_request(filp, pgoff, len); 283 + if (IS_ERR(ptr)) 284 + return -ENOMEM; 285 + 286 + /* 287 + * Some architectures have strong cache aliasing requirements. 288 + * For such architectures we need a coherent mapping which aliases 289 + * kernel memory *and* userspace memory. 
To achieve that: 290 + * - use a NULL file pointer to reference physical memory, and 291 + * - use the kernel virtual address of the shared io_uring context 292 + * (instead of the userspace-provided address, which has to be 0UL 293 + * anyway). 294 + * - use the same pgoff which the get_unmapped_area() uses to 295 + * calculate the page colouring. 296 + * For architectures without such aliasing requirements, the 297 + * architecture will return any suitable mapping because addr is 0. 298 + */ 299 + filp = NULL; 300 + flags |= MAP_SHARED; 301 + pgoff = 0; /* has been translated to ptr above */ 302 + #ifdef SHM_COLOUR 303 + addr = (uintptr_t) ptr; 304 + pgoff = addr >> PAGE_SHIFT; 305 + #else 306 + addr = 0UL; 307 + #endif 308 + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); 309 + } 310 + 311 + #else /* !CONFIG_MMU */ 312 + 313 + int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 314 + { 315 + return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; 316 + } 317 + 318 + unsigned int io_uring_nommu_mmap_capabilities(struct file *file) 319 + { 320 + return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; 321 + } 322 + 323 + unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, 324 + unsigned long len, unsigned long pgoff, 325 + unsigned long flags) 326 + { 327 + void *ptr; 328 + 329 + ptr = io_uring_validate_mmap_request(file, pgoff, len); 330 + if (IS_ERR(ptr)) 331 + return PTR_ERR(ptr); 332 + 333 + return (unsigned long) ptr; 334 + } 335 + 336 + #endif /* !CONFIG_MMU */

+25

io_uring/memmap.h

··· 1 + #ifndef IO_URING_MEMMAP_H 2 + #define IO_URING_MEMMAP_H 3 + 4 + struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); 5 + void io_pages_free(struct page ***pages, int npages); 6 + int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, 7 + struct page **pages, int npages); 8 + 9 + void *io_pages_map(struct page ***out_pages, unsigned short *npages, 10 + size_t size); 11 + void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, 12 + bool put_pages); 13 + 14 + void *__io_uaddr_map(struct page ***pages, unsigned short *npages, 15 + unsigned long uaddr, size_t size); 16 + 17 + #ifndef CONFIG_MMU 18 + unsigned int io_uring_nommu_mmap_capabilities(struct file *file); 19 + #endif 20 + unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, 21 + unsigned long len, unsigned long pgoff, 22 + unsigned long flags); 23 + int io_uring_mmap(struct file *file, struct vm_area_struct *vma); 24 + 25 + #endif

+5 -7

io_uring/msg_ring.c

··· 83 83 return -EOWNERDEAD; 84 84 85 85 init_task_work(&msg->tw, func); 86 - if (task_work_add(ctx->submitter_task, &msg->tw, TWA_SIGNAL)) 86 + if (task_work_add(task, &msg->tw, TWA_SIGNAL)) 87 87 return -EOWNERDEAD; 88 88 89 89 return IOU_ISSUE_SKIP_COMPLETE; ··· 147 147 if (target_ctx->flags & IORING_SETUP_IOPOLL) { 148 148 if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) 149 149 return -EAGAIN; 150 - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) 151 - ret = 0; 152 - io_double_unlock_ctx(target_ctx); 153 - } else { 154 - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) 155 - ret = 0; 156 150 } 151 + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) 152 + ret = 0; 153 + if (target_ctx->flags & IORING_SETUP_IOPOLL) 154 + io_double_unlock_ctx(target_ctx); 157 155 return ret; 158 156 } 159 157

+498 -362

io_uring/net.c

··· 28 28 struct sockaddr __user *addr; 29 29 int __user *addr_len; 30 30 int flags; 31 + int iou_flags; 31 32 u32 file_slot; 32 33 unsigned long nofile; 33 34 }; ··· 58 57 struct user_msghdr __user *umsg; 59 58 void __user *buf; 60 59 }; 61 - unsigned len; 60 + int len; 62 61 unsigned done_io; 63 62 unsigned msg_flags; 64 63 unsigned nr_multishot_loops; ··· 116 115 return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; 117 116 } 118 117 118 + static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) 119 + { 120 + if (kmsg->free_iov) { 121 + kfree(kmsg->free_iov); 122 + kmsg->free_iov_nr = 0; 123 + kmsg->free_iov = NULL; 124 + } 125 + } 126 + 119 127 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) 120 128 { 121 129 struct io_async_msghdr *hdr = req->async_data; 130 + struct iovec *iov; 122 131 123 - if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED) 132 + /* can't recycle, ensure we free the iovec if we have one */ 133 + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { 134 + io_netmsg_iovec_free(hdr); 124 135 return; 136 + } 125 137 126 138 /* Let normal cleanup path reap it if we fail adding to the cache */ 127 - if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { 139 + iov = hdr->free_iov; 140 + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { 141 + if (iov) 142 + kasan_mempool_poison_object(iov); 128 143 req->async_data = NULL; 129 144 req->flags &= ~REQ_F_ASYNC_DATA; 130 145 } 131 146 } 132 147 133 - static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req, 134 - unsigned int issue_flags) 148 + static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) 135 149 { 136 150 struct io_ring_ctx *ctx = req->ctx; 137 - struct io_cache_entry *entry; 138 151 struct io_async_msghdr *hdr; 139 152 140 - if (!(issue_flags & IO_URING_F_UNLOCKED)) { 141 - entry = io_alloc_cache_get(&ctx->netmsg_cache); 142 - if (entry) { 143 - hdr = container_of(entry, struct 
io_async_msghdr, cache); 144 - hdr->free_iov = NULL; 145 - req->flags |= REQ_F_ASYNC_DATA; 146 - req->async_data = hdr; 147 - return hdr; 153 + hdr = io_alloc_cache_get(&ctx->netmsg_cache); 154 + if (hdr) { 155 + if (hdr->free_iov) { 156 + kasan_mempool_unpoison_object(hdr->free_iov, 157 + hdr->free_iov_nr * sizeof(struct iovec)); 158 + req->flags |= REQ_F_NEED_CLEANUP; 148 159 } 160 + req->flags |= REQ_F_ASYNC_DATA; 161 + req->async_data = hdr; 162 + return hdr; 149 163 } 150 164 151 165 if (!io_alloc_async_data(req)) { 152 166 hdr = req->async_data; 167 + hdr->free_iov_nr = 0; 153 168 hdr->free_iov = NULL; 154 169 return hdr; 155 170 } 156 171 return NULL; 157 172 } 158 173 159 - static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req) 174 + /* assign new iovec to kmsg, if we need to */ 175 + static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, 176 + struct iovec *iov) 160 177 { 161 - /* ->prep_async is always called from the submission context */ 162 - return io_msg_alloc_async(req, 0); 178 + if (iov) { 179 + req->flags |= REQ_F_NEED_CLEANUP; 180 + kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs; 181 + if (kmsg->free_iov) 182 + kfree(kmsg->free_iov); 183 + kmsg->free_iov = iov; 184 + } 185 + return 0; 163 186 } 164 187 165 - static int io_setup_async_msg(struct io_kiocb *req, 166 - struct io_async_msghdr *kmsg, 167 - unsigned int issue_flags) 188 + static inline void io_mshot_prep_retry(struct io_kiocb *req, 189 + struct io_async_msghdr *kmsg) 168 190 { 169 - struct io_async_msghdr *async_msg; 191 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 170 192 171 - if (req_has_async_data(req)) 172 - return -EAGAIN; 173 - async_msg = io_msg_alloc_async(req, issue_flags); 174 - if (!async_msg) { 175 - kfree(kmsg->free_iov); 176 - return -ENOMEM; 177 - } 178 - req->flags |= REQ_F_NEED_CLEANUP; 179 - memcpy(async_msg, kmsg, sizeof(*kmsg)); 180 - if (async_msg->msg.msg_name) 181 - async_msg->msg.msg_name 
= &async_msg->addr; 182 - 183 - if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs) 184 - return -EAGAIN; 185 - 186 - /* if were using fast_iov, set it to the new one */ 187 - if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { 188 - size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov; 189 - async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx]; 190 - } 191 - 192 - return -EAGAIN; 193 + req->flags &= ~REQ_F_BL_EMPTY; 194 + sr->done_io = 0; 195 + sr->len = 0; /* get from the provided buffer */ 196 + req->buf_index = sr->buf_group; 193 197 } 194 198 195 199 #ifdef CONFIG_COMPAT ··· 204 198 { 205 199 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 206 200 struct compat_iovec __user *uiov; 207 - int ret; 201 + struct iovec *iov; 202 + int ret, nr_segs; 203 + 204 + if (iomsg->free_iov) { 205 + nr_segs = iomsg->free_iov_nr; 206 + iov = iomsg->free_iov; 207 + } else { 208 + iov = &iomsg->fast_iov; 209 + nr_segs = 1; 210 + } 208 211 209 212 if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) 210 213 return -EFAULT; ··· 222 207 if (req->flags & REQ_F_BUFFER_SELECT) { 223 208 compat_ssize_t clen; 224 209 225 - iomsg->free_iov = NULL; 226 210 if (msg->msg_iovlen == 0) { 227 - sr->len = 0; 211 + sr->len = iov->iov_len = 0; 212 + iov->iov_base = NULL; 228 213 } else if (msg->msg_iovlen > 1) { 229 214 return -EINVAL; 230 215 } else { ··· 240 225 return 0; 241 226 } 242 227 243 - iomsg->free_iov = iomsg->fast_iov; 244 228 ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, 245 - UIO_FASTIOV, &iomsg->free_iov, 246 - &iomsg->msg.msg_iter, true); 229 + nr_segs, &iov, &iomsg->msg.msg_iter, true); 247 230 if (unlikely(ret < 0)) 248 231 return ret; 249 232 250 - return 0; 233 + return io_net_vec_assign(req, iomsg, iov); 251 234 } 252 235 #endif 253 236 ··· 253 240 struct user_msghdr *msg, int ddir) 254 241 { 255 242 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 256 - int ret; 
243 + struct iovec *iov; 244 + int ret, nr_segs; 245 + 246 + if (iomsg->free_iov) { 247 + nr_segs = iomsg->free_iov_nr; 248 + iov = iomsg->free_iov; 249 + } else { 250 + iov = &iomsg->fast_iov; 251 + nr_segs = 1; 252 + } 257 253 258 254 if (!user_access_begin(sr->umsg, sizeof(*sr->umsg))) 259 255 return -EFAULT; ··· 278 256 279 257 if (req->flags & REQ_F_BUFFER_SELECT) { 280 258 if (msg->msg_iovlen == 0) { 281 - sr->len = iomsg->fast_iov[0].iov_len = 0; 282 - iomsg->fast_iov[0].iov_base = NULL; 283 - iomsg->free_iov = NULL; 259 + sr->len = iov->iov_len = 0; 260 + iov->iov_base = NULL; 284 261 } else if (msg->msg_iovlen > 1) { 285 262 ret = -EINVAL; 286 263 goto ua_end; ··· 287 266 /* we only need the length for provided buffers */ 288 267 if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t))) 289 268 goto ua_end; 290 - unsafe_get_user(iomsg->fast_iov[0].iov_len, 291 - &msg->msg_iov[0].iov_len, ua_end); 292 - sr->len = iomsg->fast_iov[0].iov_len; 293 - iomsg->free_iov = NULL; 269 + unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len, 270 + ua_end); 271 + sr->len = iov->iov_len; 294 272 } 295 273 ret = 0; 296 274 ua_end: ··· 298 278 } 299 279 300 280 user_access_end(); 301 - iomsg->free_iov = iomsg->fast_iov; 302 - ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV, 303 - &iomsg->free_iov, &iomsg->msg.msg_iter, false); 281 + ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, 282 + &iov, &iomsg->msg.msg_iter, false); 304 283 if (unlikely(ret < 0)) 305 284 return ret; 306 285 307 - return 0; 286 + return io_net_vec_assign(req, iomsg, iov); 308 287 } 309 288 310 289 static int io_sendmsg_copy_hdr(struct io_kiocb *req, ··· 339 320 return ret; 340 321 } 341 322 342 - int io_send_prep_async(struct io_kiocb *req) 323 + void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) 343 324 { 344 - struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); 345 - struct io_async_msghdr *io; 346 - int ret; 325 + struct 
io_async_msghdr *io = req->async_data; 347 326 348 - if (req_has_async_data(req)) 349 - return 0; 350 - zc->done_io = 0; 351 - if (!zc->addr) 352 - return 0; 353 - io = io_msg_alloc_async_prep(req); 354 - if (!io) 355 - return -ENOMEM; 356 - ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr); 357 - return ret; 327 + io_netmsg_iovec_free(io); 358 328 } 359 329 360 - static int io_setup_async_addr(struct io_kiocb *req, 361 - struct sockaddr_storage *addr_storage, 362 - unsigned int issue_flags) 330 + static int io_send_setup(struct io_kiocb *req) 363 331 { 364 332 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 365 - struct io_async_msghdr *io; 366 - 367 - if (!sr->addr || req_has_async_data(req)) 368 - return -EAGAIN; 369 - io = io_msg_alloc_async(req, issue_flags); 370 - if (!io) 371 - return -ENOMEM; 372 - memcpy(&io->addr, addr_storage, sizeof(io->addr)); 373 - return -EAGAIN; 374 - } 375 - 376 - int io_sendmsg_prep_async(struct io_kiocb *req) 377 - { 378 - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 333 + struct io_async_msghdr *kmsg = req->async_data; 379 334 int ret; 380 335 381 - sr->done_io = 0; 382 - if (!io_msg_alloc_async_prep(req)) 336 + kmsg->msg.msg_name = NULL; 337 + kmsg->msg.msg_namelen = 0; 338 + kmsg->msg.msg_control = NULL; 339 + kmsg->msg.msg_controllen = 0; 340 + kmsg->msg.msg_ubuf = NULL; 341 + 342 + if (sr->addr) { 343 + ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr); 344 + if (unlikely(ret < 0)) 345 + return ret; 346 + kmsg->msg.msg_name = &kmsg->addr; 347 + kmsg->msg.msg_namelen = sr->addr_len; 348 + } 349 + if (!io_do_buffer_select(req)) { 350 + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, 351 + &kmsg->msg.msg_iter); 352 + if (unlikely(ret < 0)) 353 + return ret; 354 + } 355 + return 0; 356 + } 357 + 358 + static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg) 359 + { 360 + struct io_async_msghdr *kmsg; 361 + int ret; 362 + 363 + kmsg = io_msg_alloc_async(req); 364 
+ if (unlikely(!kmsg)) 383 365 return -ENOMEM; 384 - ret = io_sendmsg_copy_hdr(req, req->async_data); 366 + if (!is_msg) 367 + return io_send_setup(req); 368 + ret = io_sendmsg_copy_hdr(req, kmsg); 385 369 if (!ret) 386 370 req->flags |= REQ_F_NEED_CLEANUP; 387 371 return ret; 388 372 } 389 373 390 - void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) 391 - { 392 - struct io_async_msghdr *io = req->async_data; 393 - 394 - kfree(io->free_iov); 395 - } 374 + #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) 396 375 397 376 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 398 377 { ··· 410 393 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 411 394 sr->len = READ_ONCE(sqe->len); 412 395 sr->flags = READ_ONCE(sqe->ioprio); 413 - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) 396 + if (sr->flags & ~SENDMSG_FLAGS) 414 397 return -EINVAL; 415 398 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 416 399 if (sr->msg_flags & MSG_DONTWAIT) 417 400 req->flags |= REQ_F_NOWAIT; 401 + if (sr->flags & IORING_RECVSEND_BUNDLE) { 402 + if (req->opcode == IORING_OP_SENDMSG) 403 + return -EINVAL; 404 + if (!(req->flags & REQ_F_BUFFER_SELECT)) 405 + return -EINVAL; 406 + sr->msg_flags |= MSG_WAITALL; 407 + sr->buf_group = req->buf_index; 408 + req->buf_list = NULL; 409 + } 410 + if (req->flags & REQ_F_BUFFER_SELECT && sr->len) 411 + return -EINVAL; 418 412 419 413 #ifdef CONFIG_COMPAT 420 414 if (req->ctx->compat) 421 415 sr->msg_flags |= MSG_CMSG_COMPAT; 422 416 #endif 423 - return 0; 417 + return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG); 424 418 } 425 419 426 420 static void io_req_msg_cleanup(struct io_kiocb *req, 427 - struct io_async_msghdr *kmsg, 428 421 unsigned int issue_flags) 429 422 { 430 423 req->flags &= ~REQ_F_NEED_CLEANUP; 431 - /* fast path, check for non-NULL to avoid function call */ 432 - if (kmsg->free_iov) 433 - kfree(kmsg->free_iov); 434 424 io_netmsg_recycle(req, issue_flags); 425 + 
} 426 + 427 + /* 428 + * For bundle completions, we need to figure out how many segments we consumed. 429 + * A bundle could be using a single ITER_UBUF if that's all we mapped, or it 430 + * could be using an ITER_IOVEC. If the latter, then if we consumed all of 431 + * the segments, then it's a trivial question to answer. If we have residual 432 + * data in the iter, then loop the segments to figure out how much we 433 + * transferred. 434 + */ 435 + static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) 436 + { 437 + struct iovec *iov; 438 + int nbufs; 439 + 440 + /* no data is always zero segments, and a ubuf is always 1 segment */ 441 + if (ret <= 0) 442 + return 0; 443 + if (iter_is_ubuf(&kmsg->msg.msg_iter)) 444 + return 1; 445 + 446 + iov = kmsg->free_iov; 447 + if (!iov) 448 + iov = &kmsg->fast_iov; 449 + 450 + /* if all data was transferred, it's basic pointer math */ 451 + if (!iov_iter_count(&kmsg->msg.msg_iter)) 452 + return iter_iov(&kmsg->msg.msg_iter) - iov; 453 + 454 + /* short transfer, count segments */ 455 + nbufs = 0; 456 + do { 457 + int this_len = min_t(int, iov[nbufs].iov_len, ret); 458 + 459 + nbufs++; 460 + ret -= this_len; 461 + } while (ret); 462 + 463 + return nbufs; 464 + } 465 + 466 + static inline bool io_send_finish(struct io_kiocb *req, int *ret, 467 + struct io_async_msghdr *kmsg, 468 + unsigned issue_flags) 469 + { 470 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 471 + bool bundle_finished = *ret <= 0; 472 + unsigned int cflags; 473 + 474 + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { 475 + cflags = io_put_kbuf(req, issue_flags); 476 + goto finish; 477 + } 478 + 479 + cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags); 480 + 481 + if (bundle_finished || req->flags & REQ_F_BL_EMPTY) 482 + goto finish; 483 + 484 + /* 485 + * Fill CQE for this receive and see if we should keep trying to 486 + * receive from this socket. 
487 + */ 488 + if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { 489 + io_mshot_prep_retry(req, kmsg); 490 + return false; 491 + } 492 + 493 + /* Otherwise stop bundle and use the current result. */ 494 + finish: 495 + io_req_set_res(req, *ret, cflags); 496 + *ret = IOU_OK; 497 + return true; 435 498 } 436 499 437 500 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) 438 501 { 439 502 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 440 - struct io_async_msghdr iomsg, *kmsg; 503 + struct io_async_msghdr *kmsg = req->async_data; 441 504 struct socket *sock; 442 505 unsigned flags; 443 506 int min_ret = 0; ··· 527 430 if (unlikely(!sock)) 528 431 return -ENOTSOCK; 529 432 530 - if (req_has_async_data(req)) { 531 - kmsg = req->async_data; 532 - kmsg->msg.msg_control_user = sr->msg_control; 533 - } else { 534 - ret = io_sendmsg_copy_hdr(req, &iomsg); 535 - if (ret) 536 - return ret; 537 - kmsg = &iomsg; 538 - } 539 - 540 433 if (!(req->flags & REQ_F_POLLED) && 541 434 (sr->flags & IORING_RECVSEND_POLL_FIRST)) 542 - return io_setup_async_msg(req, kmsg, issue_flags); 435 + return -EAGAIN; 543 436 544 437 flags = sr->msg_flags; 545 438 if (issue_flags & IO_URING_F_NONBLOCK) ··· 537 450 if (flags & MSG_WAITALL) 538 451 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 539 452 453 + kmsg->msg.msg_control_user = sr->msg_control; 454 + 540 455 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 541 456 542 457 if (ret < min_ret) { 543 458 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 544 - return io_setup_async_msg(req, kmsg, issue_flags); 459 + return -EAGAIN; 545 460 if (ret > 0 && io_net_retry(sock, flags)) { 546 461 kmsg->msg.msg_controllen = 0; 547 462 kmsg->msg.msg_control = NULL; 548 463 sr->done_io += ret; 549 464 req->flags |= REQ_F_BL_NO_RECYCLE; 550 - return io_setup_async_msg(req, kmsg, issue_flags); 465 + return -EAGAIN; 551 466 } 552 467 if (ret == -ERESTARTSYS) 553 468 ret = -EINTR; 554 469 req_set_fail(req); 555 
470 } 556 - io_req_msg_cleanup(req, kmsg, issue_flags); 471 + io_req_msg_cleanup(req, issue_flags); 557 472 if (ret >= 0) 558 473 ret += sr->done_io; 559 474 else if (sr->done_io) ··· 566 477 567 478 int io_send(struct io_kiocb *req, unsigned int issue_flags) 568 479 { 569 - struct sockaddr_storage __address; 570 480 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 571 - struct msghdr msg; 481 + struct io_async_msghdr *kmsg = req->async_data; 572 482 struct socket *sock; 573 483 unsigned flags; 574 484 int min_ret = 0; 575 485 int ret; 576 486 577 - msg.msg_name = NULL; 578 - msg.msg_control = NULL; 579 - msg.msg_controllen = 0; 580 - msg.msg_namelen = 0; 581 - msg.msg_ubuf = NULL; 582 - 583 - if (sr->addr) { 584 - if (req_has_async_data(req)) { 585 - struct io_async_msghdr *io = req->async_data; 586 - 587 - msg.msg_name = &io->addr; 588 - } else { 589 - ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address); 590 - if (unlikely(ret < 0)) 591 - return ret; 592 - msg.msg_name = (struct sockaddr *)&__address; 593 - } 594 - msg.msg_namelen = sr->addr_len; 595 - } 596 - 597 - if (!(req->flags & REQ_F_POLLED) && 598 - (sr->flags & IORING_RECVSEND_POLL_FIRST)) 599 - return io_setup_async_addr(req, &__address, issue_flags); 600 - 601 487 sock = sock_from_file(req->file); 602 488 if (unlikely(!sock)) 603 489 return -ENOTSOCK; 604 490 605 - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter); 606 - if (unlikely(ret)) 607 - return ret; 491 + if (!(req->flags & REQ_F_POLLED) && 492 + (sr->flags & IORING_RECVSEND_POLL_FIRST)) 493 + return -EAGAIN; 608 494 609 495 flags = sr->msg_flags; 610 496 if (issue_flags & IO_URING_F_NONBLOCK) 611 497 flags |= MSG_DONTWAIT; 612 - if (flags & MSG_WAITALL) 613 - min_ret = iov_iter_count(&msg.msg_iter); 498 + 499 + retry_bundle: 500 + if (io_do_buffer_select(req)) { 501 + struct buf_sel_arg arg = { 502 + .iovs = &kmsg->fast_iov, 503 + .max_len = INT_MAX, 504 + .nr_iovs = 1, 505 + .mode = KBUF_MODE_EXPAND, 506 
+ }; 507 + 508 + if (kmsg->free_iov) { 509 + arg.nr_iovs = kmsg->free_iov_nr; 510 + arg.iovs = kmsg->free_iov; 511 + arg.mode |= KBUF_MODE_FREE; 512 + } 513 + 514 + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) 515 + arg.nr_iovs = 1; 516 + 517 + ret = io_buffers_select(req, &arg, issue_flags); 518 + if (unlikely(ret < 0)) 519 + return ret; 520 + 521 + sr->len = arg.out_len; 522 + iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret, 523 + arg.out_len); 524 + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { 525 + kmsg->free_iov_nr = ret; 526 + kmsg->free_iov = arg.iovs; 527 + } 528 + } 529 + 530 + /* 531 + * If MSG_WAITALL is set, or this is a bundle send, then we need 532 + * the full amount. If just bundle is set, if we do a short send 533 + * then we complete the bundle sequence rather than continue on. 534 + */ 535 + if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE) 536 + min_ret = iov_iter_count(&kmsg->msg.msg_iter); 614 537 615 538 flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; 616 - msg.msg_flags = flags; 617 - ret = sock_sendmsg(sock, &msg); 539 + kmsg->msg.msg_flags = flags; 540 + ret = sock_sendmsg(sock, &kmsg->msg); 618 541 if (ret < min_ret) { 619 542 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 620 - return io_setup_async_addr(req, &__address, issue_flags); 543 + return -EAGAIN; 621 544 622 545 if (ret > 0 && io_net_retry(sock, flags)) { 623 546 sr->len -= ret; 624 547 sr->buf += ret; 625 548 sr->done_io += ret; 626 549 req->flags |= REQ_F_BL_NO_RECYCLE; 627 - return io_setup_async_addr(req, &__address, issue_flags); 550 + return -EAGAIN; 628 551 } 629 552 if (ret == -ERESTARTSYS) 630 553 ret = -EINTR; ··· 646 545 ret += sr->done_io; 647 546 else if (sr->done_io) 648 547 ret = sr->done_io; 649 - io_req_set_res(req, ret, 0); 650 - return IOU_OK; 548 + 549 + if (!io_send_finish(req, &ret, kmsg, issue_flags)) 550 + goto retry_bundle; 551 + 552 + io_req_msg_cleanup(req, issue_flags); 553 + return ret; 651 554 } 652 
555 653 556 static int io_recvmsg_mshot_prep(struct io_kiocb *req, ··· 716 611 msg.msg_controllen); 717 612 } 718 613 719 - int io_recvmsg_prep_async(struct io_kiocb *req) 614 + static int io_recvmsg_prep_setup(struct io_kiocb *req) 720 615 { 721 616 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 722 - struct io_async_msghdr *iomsg; 617 + struct io_async_msghdr *kmsg; 723 618 int ret; 724 619 725 - sr->done_io = 0; 726 - if (!io_msg_alloc_async_prep(req)) 620 + kmsg = io_msg_alloc_async(req); 621 + if (unlikely(!kmsg)) 727 622 return -ENOMEM; 728 - iomsg = req->async_data; 729 - ret = io_recvmsg_copy_hdr(req, iomsg); 623 + 624 + if (req->opcode == IORING_OP_RECV) { 625 + kmsg->msg.msg_name = NULL; 626 + kmsg->msg.msg_namelen = 0; 627 + kmsg->msg.msg_control = NULL; 628 + kmsg->msg.msg_get_inq = 1; 629 + kmsg->msg.msg_controllen = 0; 630 + kmsg->msg.msg_iocb = NULL; 631 + kmsg->msg.msg_ubuf = NULL; 632 + 633 + if (!io_do_buffer_select(req)) { 634 + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, 635 + &kmsg->msg.msg_iter); 636 + if (unlikely(ret)) 637 + return ret; 638 + } 639 + return 0; 640 + } 641 + 642 + ret = io_recvmsg_copy_hdr(req, kmsg); 730 643 if (!ret) 731 644 req->flags |= REQ_F_NEED_CLEANUP; 732 645 return ret; 733 646 } 734 647 735 - #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT) 648 + #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ 649 + IORING_RECVSEND_BUNDLE) 736 650 737 651 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 738 652 { ··· 765 641 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 766 642 sr->len = READ_ONCE(sqe->len); 767 643 sr->flags = READ_ONCE(sqe->ioprio); 768 - if (sr->flags & ~(RECVMSG_FLAGS)) 644 + if (sr->flags & ~RECVMSG_FLAGS) 769 645 return -EINVAL; 770 646 sr->msg_flags = READ_ONCE(sqe->msg_flags); 771 647 if (sr->msg_flags & MSG_DONTWAIT) 772 648 req->flags |= REQ_F_NOWAIT; 773 649 if (sr->msg_flags & MSG_ERRQUEUE) 774 650 
req->flags |= REQ_F_CLEAR_POLLIN; 775 - if (sr->flags & IORING_RECV_MULTISHOT) { 776 - if (!(req->flags & REQ_F_BUFFER_SELECT)) 777 - return -EINVAL; 778 - if (sr->msg_flags & MSG_WAITALL) 779 - return -EINVAL; 780 - if (req->opcode == IORING_OP_RECV && sr->len) 781 - return -EINVAL; 782 - req->flags |= REQ_F_APOLL_MULTISHOT; 651 + if (req->flags & REQ_F_BUFFER_SELECT) { 783 652 /* 784 653 * Store the buffer group for this multishot receive separately, 785 654 * as if we end up doing an io-wq based issue that selects a ··· 782 665 * restore it. 783 666 */ 784 667 sr->buf_group = req->buf_index; 668 + req->buf_list = NULL; 669 + } 670 + if (sr->flags & IORING_RECV_MULTISHOT) { 671 + if (!(req->flags & REQ_F_BUFFER_SELECT)) 672 + return -EINVAL; 673 + if (sr->msg_flags & MSG_WAITALL) 674 + return -EINVAL; 675 + if (req->opcode == IORING_OP_RECV && sr->len) 676 + return -EINVAL; 677 + req->flags |= REQ_F_APOLL_MULTISHOT; 678 + } 679 + if (sr->flags & IORING_RECVSEND_BUNDLE) { 680 + if (req->opcode == IORING_OP_RECVMSG) 681 + return -EINVAL; 785 682 } 786 683 787 684 #ifdef CONFIG_COMPAT ··· 803 672 sr->msg_flags |= MSG_CMSG_COMPAT; 804 673 #endif 805 674 sr->nr_multishot_loops = 0; 806 - return 0; 807 - } 808 - 809 - static inline void io_recv_prep_retry(struct io_kiocb *req) 810 - { 811 - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 812 - 813 - req->flags &= ~REQ_F_BL_EMPTY; 814 - sr->done_io = 0; 815 - sr->len = 0; /* get from the provided buffer */ 816 - req->buf_index = sr->buf_group; 675 + return io_recvmsg_prep_setup(req); 817 676 } 818 677 819 678 /* ··· 813 692 * again (for multishot). 
814 693 */ 815 694 static inline bool io_recv_finish(struct io_kiocb *req, int *ret, 816 - struct msghdr *msg, bool mshot_finished, 817 - unsigned issue_flags) 695 + struct io_async_msghdr *kmsg, 696 + bool mshot_finished, unsigned issue_flags) 818 697 { 698 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 819 699 unsigned int cflags; 820 700 821 - cflags = io_put_kbuf(req, issue_flags); 822 - if (msg->msg_inq > 0) 701 + if (sr->flags & IORING_RECVSEND_BUNDLE) 702 + cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), 703 + issue_flags); 704 + else 705 + cflags = io_put_kbuf(req, issue_flags); 706 + 707 + if (kmsg->msg.msg_inq > 0) 823 708 cflags |= IORING_CQE_F_SOCK_NONEMPTY; 709 + 710 + /* bundle with no more immediate buffers, we're done */ 711 + if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY) 712 + goto finish; 824 713 825 714 /* 826 715 * Fill CQE for this receive and see if we should keep trying to 827 716 * receive from this socket. 828 717 */ 829 718 if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && 830 - io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, 831 - *ret, cflags | IORING_CQE_F_MORE)) { 832 - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 719 + io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { 833 720 int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; 834 721 835 - io_recv_prep_retry(req); 722 + io_mshot_prep_retry(req, kmsg); 836 723 /* Known not-empty or unknown state, retry */ 837 - if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq < 0) { 724 + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { 838 725 if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) 839 726 return false; 840 727 /* mshot retries exceeded, force a requeue */ ··· 857 728 } 858 729 859 730 /* Finish the request / stop multishot. 
*/ 731 + finish: 860 732 io_req_set_res(req, *ret, cflags); 861 733 862 734 if (issue_flags & IO_URING_F_MULTISHOT) 863 735 *ret = IOU_STOP_MULTISHOT; 864 736 else 865 737 *ret = IOU_OK; 738 + io_req_msg_cleanup(req, issue_flags); 866 739 return true; 867 740 } 868 741 ··· 955 824 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) 956 825 { 957 826 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 958 - struct io_async_msghdr iomsg, *kmsg; 827 + struct io_async_msghdr *kmsg = req->async_data; 959 828 struct socket *sock; 960 829 unsigned flags; 961 830 int ret, min_ret = 0; ··· 966 835 if (unlikely(!sock)) 967 836 return -ENOTSOCK; 968 837 969 - if (req_has_async_data(req)) { 970 - kmsg = req->async_data; 971 - } else { 972 - ret = io_recvmsg_copy_hdr(req, &iomsg); 973 - if (ret) 974 - return ret; 975 - kmsg = &iomsg; 976 - } 977 - 978 838 if (!(req->flags & REQ_F_POLLED) && 979 839 (sr->flags & IORING_RECVSEND_POLL_FIRST)) 980 - return io_setup_async_msg(req, kmsg, issue_flags); 840 + return -EAGAIN; 981 841 982 842 flags = sr->msg_flags; 983 843 if (force_nonblock) ··· 1010 888 1011 889 if (ret < min_ret) { 1012 890 if (ret == -EAGAIN && force_nonblock) { 1013 - ret = io_setup_async_msg(req, kmsg, issue_flags); 1014 - if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) { 891 + if (issue_flags & IO_URING_F_MULTISHOT) { 1015 892 io_kbuf_recycle(req, issue_flags); 1016 893 return IOU_ISSUE_SKIP_COMPLETE; 1017 894 } 1018 - return ret; 895 + return -EAGAIN; 1019 896 } 1020 897 if (ret > 0 && io_net_retry(sock, flags)) { 1021 898 sr->done_io += ret; 1022 899 req->flags |= REQ_F_BL_NO_RECYCLE; 1023 - return io_setup_async_msg(req, kmsg, issue_flags); 900 + return -EAGAIN; 1024 901 } 1025 902 if (ret == -ERESTARTSYS) 1026 903 ret = -EINTR; ··· 1035 914 else 1036 915 io_kbuf_recycle(req, issue_flags); 1037 916 1038 - if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) 917 + if (!io_recv_finish(req, &ret, kmsg, 
mshot_finished, issue_flags)) 1039 918 goto retry_multishot; 1040 919 1041 - if (mshot_finished) 1042 - io_req_msg_cleanup(req, kmsg, issue_flags); 1043 - else if (ret == -EAGAIN) 1044 - return io_setup_async_msg(req, kmsg, issue_flags); 1045 - 1046 920 return ret; 921 + } 922 + 923 + static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg, 924 + size_t *len, unsigned int issue_flags) 925 + { 926 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 927 + int ret; 928 + 929 + /* 930 + * If the ring isn't locked, then don't use the peek interface 931 + * to grab multiple buffers as we will lock/unlock between 932 + * this selection and posting the buffers. 933 + */ 934 + if (!(issue_flags & IO_URING_F_UNLOCKED) && 935 + sr->flags & IORING_RECVSEND_BUNDLE) { 936 + struct buf_sel_arg arg = { 937 + .iovs = &kmsg->fast_iov, 938 + .nr_iovs = 1, 939 + .mode = KBUF_MODE_EXPAND, 940 + }; 941 + 942 + if (kmsg->free_iov) { 943 + arg.nr_iovs = kmsg->free_iov_nr; 944 + arg.iovs = kmsg->free_iov; 945 + arg.mode |= KBUF_MODE_FREE; 946 + } 947 + 948 + if (kmsg->msg.msg_inq > 0) 949 + arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); 950 + 951 + ret = io_buffers_peek(req, &arg); 952 + if (unlikely(ret < 0)) 953 + return ret; 954 + 955 + /* special case 1 vec, can be a fast path */ 956 + if (ret == 1) { 957 + sr->buf = arg.iovs[0].iov_base; 958 + sr->len = arg.iovs[0].iov_len; 959 + goto map_ubuf; 960 + } 961 + iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, 962 + arg.out_len); 963 + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { 964 + kmsg->free_iov_nr = ret; 965 + kmsg->free_iov = arg.iovs; 966 + } 967 + } else { 968 + void __user *buf; 969 + 970 + *len = sr->len; 971 + buf = io_buffer_select(req, len, issue_flags); 972 + if (!buf) 973 + return -ENOBUFS; 974 + sr->buf = buf; 975 + sr->len = *len; 976 + map_ubuf: 977 + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, 978 + &kmsg->msg.msg_iter); 979 + if 
(unlikely(ret)) 980 + return ret; 981 + } 982 + 983 + return 0; 1047 984 } 1048 985 1049 986 int io_recv(struct io_kiocb *req, unsigned int issue_flags) 1050 987 { 1051 988 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1052 - struct msghdr msg; 989 + struct io_async_msghdr *kmsg = req->async_data; 1053 990 struct socket *sock; 1054 991 unsigned flags; 1055 992 int ret, min_ret = 0; ··· 1122 943 if (unlikely(!sock)) 1123 944 return -ENOTSOCK; 1124 945 1125 - msg.msg_name = NULL; 1126 - msg.msg_namelen = 0; 1127 - msg.msg_control = NULL; 1128 - msg.msg_get_inq = 1; 1129 - msg.msg_controllen = 0; 1130 - msg.msg_iocb = NULL; 1131 - msg.msg_ubuf = NULL; 1132 - 1133 946 flags = sr->msg_flags; 1134 947 if (force_nonblock) 1135 948 flags |= MSG_DONTWAIT; 1136 949 1137 950 retry_multishot: 1138 951 if (io_do_buffer_select(req)) { 1139 - void __user *buf; 1140 - 1141 - buf = io_buffer_select(req, &len, issue_flags); 1142 - if (!buf) 1143 - return -ENOBUFS; 1144 - sr->buf = buf; 1145 - sr->len = len; 952 + ret = io_recv_buf_select(req, kmsg, &len, issue_flags); 953 + if (unlikely(ret)) 954 + goto out_free; 955 + sr->buf = NULL; 1146 956 } 1147 957 1148 - ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter); 1149 - if (unlikely(ret)) 1150 - goto out_free; 1151 - 1152 - msg.msg_inq = -1; 1153 - msg.msg_flags = 0; 958 + kmsg->msg.msg_inq = -1; 959 + kmsg->msg.msg_flags = 0; 1154 960 1155 961 if (flags & MSG_WAITALL) 1156 - min_ret = iov_iter_count(&msg.msg_iter); 962 + min_ret = iov_iter_count(&kmsg->msg.msg_iter); 1157 963 1158 - ret = sock_recvmsg(sock, &msg, flags); 964 + ret = sock_recvmsg(sock, &kmsg->msg, flags); 1159 965 if (ret < min_ret) { 1160 966 if (ret == -EAGAIN && force_nonblock) { 1161 967 if (issue_flags & IO_URING_F_MULTISHOT) { ··· 1160 996 if (ret == -ERESTARTSYS) 1161 997 ret = -EINTR; 1162 998 req_set_fail(req); 1163 - } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { 999 + } else if ((flags & 
MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { 1164 1000 out_free: 1165 1001 req_set_fail(req); 1166 1002 } ··· 1172 1008 else 1173 1009 io_kbuf_recycle(req, issue_flags); 1174 1010 1175 - if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags)) 1011 + if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags)) 1176 1012 goto retry_multishot; 1177 1013 1178 1014 return ret; ··· 1181 1017 void io_send_zc_cleanup(struct io_kiocb *req) 1182 1018 { 1183 1019 struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); 1184 - struct io_async_msghdr *io; 1020 + struct io_async_msghdr *io = req->async_data; 1185 1021 1186 - if (req_has_async_data(req)) { 1187 - io = req->async_data; 1188 - /* might be ->fast_iov if *msg_copy_hdr failed */ 1189 - if (io->free_iov != io->fast_iov) 1190 - kfree(io->free_iov); 1191 - } 1022 + if (req_has_async_data(req)) 1023 + io_netmsg_iovec_free(io); 1192 1024 if (zc->notif) { 1193 1025 io_notif_flush(zc->notif); 1194 1026 zc->notif = NULL; ··· 1201 1041 struct io_kiocb *notif; 1202 1042 1203 1043 zc->done_io = 0; 1044 + req->flags |= REQ_F_POLL_NO_LAZY; 1204 1045 1205 1046 if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) 1206 1047 return -EINVAL; ··· 1222 1061 if (zc->flags & ~IO_ZC_FLAGS_VALID) 1223 1062 return -EINVAL; 1224 1063 if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { 1225 - io_notif_set_extended(notif); 1226 - io_notif_to_data(notif)->zc_report = true; 1064 + struct io_notif_data *nd = io_notif_to_data(notif); 1065 + 1066 + nd->zc_report = true; 1067 + nd->zc_used = false; 1068 + nd->zc_copied = false; 1227 1069 } 1228 1070 } 1229 1071 ··· 1254 1090 1255 1091 zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 1256 1092 zc->len = READ_ONCE(sqe->len); 1257 - zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 1093 + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; 1258 1094 if (zc->msg_flags & MSG_DONTWAIT) 1259 1095 req->flags |= REQ_F_NOWAIT; 1260 1096 ··· 
1262 1098 if (req->ctx->compat) 1263 1099 zc->msg_flags |= MSG_CMSG_COMPAT; 1264 1100 #endif 1265 - return 0; 1101 + return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC); 1266 1102 } 1267 1103 1268 1104 static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb, ··· 1323 1159 return ret; 1324 1160 } 1325 1161 1162 + static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg) 1163 + { 1164 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1165 + int ret; 1166 + 1167 + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { 1168 + ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu, 1169 + (u64)(uintptr_t)sr->buf, sr->len); 1170 + if (unlikely(ret)) 1171 + return ret; 1172 + kmsg->msg.sg_from_iter = io_sg_from_iter; 1173 + } else { 1174 + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); 1175 + if (unlikely(ret)) 1176 + return ret; 1177 + ret = io_notif_account_mem(sr->notif, sr->len); 1178 + if (unlikely(ret)) 1179 + return ret; 1180 + kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; 1181 + } 1182 + 1183 + return ret; 1184 + } 1185 + 1326 1186 int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) 1327 1187 { 1328 - struct sockaddr_storage __address; 1329 1188 struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); 1330 - struct msghdr msg; 1189 + struct io_async_msghdr *kmsg = req->async_data; 1331 1190 struct socket *sock; 1332 1191 unsigned msg_flags; 1333 1192 int ret, min_ret = 0; ··· 1361 1174 if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) 1362 1175 return -EOPNOTSUPP; 1363 1176 1364 - msg.msg_name = NULL; 1365 - msg.msg_control = NULL; 1366 - msg.msg_controllen = 0; 1367 - msg.msg_namelen = 0; 1368 - 1369 - if (zc->addr) { 1370 - if (req_has_async_data(req)) { 1371 - struct io_async_msghdr *io = req->async_data; 1372 - 1373 - msg.msg_name = &io->addr; 1374 - } else { 1375 - ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address); 1376 - if (unlikely(ret 
< 0)) 1377 - return ret; 1378 - msg.msg_name = (struct sockaddr *)&__address; 1379 - } 1380 - msg.msg_namelen = zc->addr_len; 1381 - } 1382 - 1383 1177 if (!(req->flags & REQ_F_POLLED) && 1384 1178 (zc->flags & IORING_RECVSEND_POLL_FIRST)) 1385 - return io_setup_async_addr(req, &__address, issue_flags); 1179 + return -EAGAIN; 1386 1180 1387 - if (zc->flags & IORING_RECVSEND_FIXED_BUF) { 1388 - ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu, 1389 - (u64)(uintptr_t)zc->buf, zc->len); 1181 + if (!zc->done_io) { 1182 + ret = io_send_zc_import(req, kmsg); 1390 1183 if (unlikely(ret)) 1391 1184 return ret; 1392 - msg.sg_from_iter = io_sg_from_iter; 1393 - } else { 1394 - io_notif_set_extended(zc->notif); 1395 - ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter); 1396 - if (unlikely(ret)) 1397 - return ret; 1398 - ret = io_notif_account_mem(zc->notif, zc->len); 1399 - if (unlikely(ret)) 1400 - return ret; 1401 - msg.sg_from_iter = io_sg_from_iter_iovec; 1402 1185 } 1403 1186 1404 - msg_flags = zc->msg_flags | MSG_ZEROCOPY; 1187 + msg_flags = zc->msg_flags; 1405 1188 if (issue_flags & IO_URING_F_NONBLOCK) 1406 1189 msg_flags |= MSG_DONTWAIT; 1407 1190 if (msg_flags & MSG_WAITALL) 1408 - min_ret = iov_iter_count(&msg.msg_iter); 1191 + min_ret = iov_iter_count(&kmsg->msg.msg_iter); 1409 1192 msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; 1410 1193 1411 - msg.msg_flags = msg_flags; 1412 - msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; 1413 - ret = sock_sendmsg(sock, &msg); 1194 + kmsg->msg.msg_flags = msg_flags; 1195 + kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; 1196 + ret = sock_sendmsg(sock, &kmsg->msg); 1414 1197 1415 1198 if (unlikely(ret < min_ret)) { 1416 1199 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 1417 - return io_setup_async_addr(req, &__address, issue_flags); 1200 + return -EAGAIN; 1418 1201 1419 - if (ret > 0 && io_net_retry(sock, msg.msg_flags)) { 1202 + if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) 
{ 1420 1203 zc->len -= ret; 1421 1204 zc->buf += ret; 1422 1205 zc->done_io += ret; 1423 1206 req->flags |= REQ_F_BL_NO_RECYCLE; 1424 - return io_setup_async_addr(req, &__address, issue_flags); 1207 + return -EAGAIN; 1425 1208 } 1426 1209 if (ret == -ERESTARTSYS) 1427 1210 ret = -EINTR; ··· 1409 1252 */ 1410 1253 if (!(issue_flags & IO_URING_F_UNLOCKED)) { 1411 1254 io_notif_flush(zc->notif); 1412 - req->flags &= ~REQ_F_NEED_CLEANUP; 1255 + io_req_msg_cleanup(req, 0); 1413 1256 } 1414 1257 io_req_set_res(req, ret, IORING_CQE_F_MORE); 1415 1258 return IOU_OK; ··· 1418 1261 int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) 1419 1262 { 1420 1263 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1421 - struct io_async_msghdr iomsg, *kmsg; 1264 + struct io_async_msghdr *kmsg = req->async_data; 1422 1265 struct socket *sock; 1423 1266 unsigned flags; 1424 1267 int ret, min_ret = 0; 1425 - 1426 - io_notif_set_extended(sr->notif); 1427 1268 1428 1269 sock = sock_from_file(req->file); 1429 1270 if (unlikely(!sock)) ··· 1429 1274 if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) 1430 1275 return -EOPNOTSUPP; 1431 1276 1432 - if (req_has_async_data(req)) { 1433 - kmsg = req->async_data; 1434 - kmsg->msg.msg_control_user = sr->msg_control; 1435 - } else { 1436 - ret = io_sendmsg_copy_hdr(req, &iomsg); 1437 - if (ret) 1438 - return ret; 1439 - kmsg = &iomsg; 1440 - } 1441 - 1442 1277 if (!(req->flags & REQ_F_POLLED) && 1443 1278 (sr->flags & IORING_RECVSEND_POLL_FIRST)) 1444 - return io_setup_async_msg(req, kmsg, issue_flags); 1279 + return -EAGAIN; 1445 1280 1446 - flags = sr->msg_flags | MSG_ZEROCOPY; 1281 + flags = sr->msg_flags; 1447 1282 if (issue_flags & IO_URING_F_NONBLOCK) 1448 1283 flags |= MSG_DONTWAIT; 1449 1284 if (flags & MSG_WAITALL) 1450 1285 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 1451 1286 1287 + kmsg->msg.msg_control_user = sr->msg_control; 1452 1288 kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; 1453 1289 
kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; 1454 1290 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 1455 1291 1456 1292 if (unlikely(ret < min_ret)) { 1457 1293 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 1458 - return io_setup_async_msg(req, kmsg, issue_flags); 1294 + return -EAGAIN; 1459 1295 1460 1296 if (ret > 0 && io_net_retry(sock, flags)) { 1461 1297 sr->done_io += ret; 1462 1298 req->flags |= REQ_F_BL_NO_RECYCLE; 1463 - return io_setup_async_msg(req, kmsg, issue_flags); 1299 + return -EAGAIN; 1464 1300 } 1465 1301 if (ret == -ERESTARTSYS) 1466 1302 ret = -EINTR; 1467 1303 req_set_fail(req); 1468 1304 } 1469 - /* fast path, check for non-NULL to avoid function call */ 1470 - if (kmsg->free_iov) { 1471 - kfree(kmsg->free_iov); 1472 - kmsg->free_iov = NULL; 1473 - } 1474 1305 1475 - io_netmsg_recycle(req, issue_flags); 1476 1306 if (ret >= 0) 1477 1307 ret += sr->done_io; 1478 1308 else if (sr->done_io) ··· 1469 1329 */ 1470 1330 if (!(issue_flags & IO_URING_F_UNLOCKED)) { 1471 1331 io_notif_flush(sr->notif); 1472 - req->flags &= ~REQ_F_NEED_CLEANUP; 1332 + io_req_msg_cleanup(req, 0); 1473 1333 } 1474 1334 io_req_set_res(req, ret, IORING_CQE_F_MORE); 1475 1335 return IOU_OK; ··· 1487 1347 req->cqe.flags |= IORING_CQE_F_MORE; 1488 1348 } 1489 1349 1350 + #define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \ 1351 + IORING_ACCEPT_POLL_FIRST) 1352 + 1490 1353 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1491 1354 { 1492 1355 struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); 1493 - unsigned flags; 1494 1356 1495 1357 if (sqe->len || sqe->buf_index) 1496 1358 return -EINVAL; ··· 1501 1359 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 1502 1360 accept->flags = READ_ONCE(sqe->accept_flags); 1503 1361 accept->nofile = rlimit(RLIMIT_NOFILE); 1504 - flags = READ_ONCE(sqe->ioprio); 1505 - if (flags & ~IORING_ACCEPT_MULTISHOT) 1362 + accept->iou_flags = 
READ_ONCE(sqe->ioprio); 1363 + if (accept->iou_flags & ~ACCEPT_FLAGS) 1506 1364 return -EINVAL; 1507 1365 1508 1366 accept->file_slot = READ_ONCE(sqe->file_index); 1509 1367 if (accept->file_slot) { 1510 1368 if (accept->flags & SOCK_CLOEXEC) 1511 1369 return -EINVAL; 1512 - if (flags & IORING_ACCEPT_MULTISHOT && 1370 + if (accept->iou_flags & IORING_ACCEPT_MULTISHOT && 1513 1371 accept->file_slot != IORING_FILE_INDEX_ALLOC) 1514 1372 return -EINVAL; 1515 1373 } ··· 1517 1375 return -EINVAL; 1518 1376 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) 1519 1377 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 1520 - if (flags & IORING_ACCEPT_MULTISHOT) 1378 + if (accept->iou_flags & IORING_ACCEPT_MULTISHOT) 1521 1379 req->flags |= REQ_F_APOLL_MULTISHOT; 1380 + if (accept->iou_flags & IORING_ACCEPT_DONTWAIT) 1381 + req->flags |= REQ_F_NOWAIT; 1522 1382 return 0; 1523 1383 } 1524 1384 ··· 1532 1388 bool fixed = !!accept->file_slot; 1533 1389 struct file *file; 1534 1390 int ret, fd; 1391 + 1392 + if (!(req->flags & REQ_F_POLLED) && 1393 + accept->iou_flags & IORING_ACCEPT_POLL_FIRST) 1394 + return -EAGAIN; 1535 1395 1536 1396 retry: 1537 1397 if (!fixed) { ··· 1549 1401 if (!fixed) 1550 1402 put_unused_fd(fd); 1551 1403 ret = PTR_ERR(file); 1552 - if (ret == -EAGAIN && force_nonblock) { 1404 + if (ret == -EAGAIN && force_nonblock && 1405 + !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) { 1553 1406 /* 1554 1407 * if it's multishot and polled, we don't need to 1555 1408 * return EAGAIN to arm the poll infra since it ··· 1578 1429 1579 1430 if (ret < 0) 1580 1431 return ret; 1581 - if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, 1582 - ret, IORING_CQE_F_MORE)) 1432 + if (io_req_post_cqe(req, ret, IORING_CQE_F_MORE)) 1583 1433 goto retry; 1584 1434 1585 1435 io_req_set_res(req, ret, 0); ··· 1639 1491 return IOU_OK; 1640 1492 } 1641 1493 1642 - int io_connect_prep_async(struct io_kiocb *req) 1643 - { 1644 - struct 
io_async_connect *io = req->async_data; 1645 - struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); 1646 - 1647 - return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); 1648 - } 1649 - 1650 1494 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1651 1495 { 1652 1496 struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); 1497 + struct io_async_msghdr *io; 1653 1498 1654 1499 if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) 1655 1500 return -EINVAL; ··· 1650 1509 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 1651 1510 conn->addr_len = READ_ONCE(sqe->addr2); 1652 1511 conn->in_progress = conn->seen_econnaborted = false; 1653 - return 0; 1512 + 1513 + io = io_msg_alloc_async(req); 1514 + if (unlikely(!io)) 1515 + return -ENOMEM; 1516 + 1517 + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr); 1654 1518 } 1655 1519 1656 1520 int io_connect(struct io_kiocb *req, unsigned int issue_flags) 1657 1521 { 1658 1522 struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect); 1659 - struct io_async_connect __io, *io; 1523 + struct io_async_msghdr *io = req->async_data; 1660 1524 unsigned file_flags; 1661 1525 int ret; 1662 1526 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 1663 1527 1664 - if (req_has_async_data(req)) { 1665 - io = req->async_data; 1666 - } else { 1667 - ret = move_addr_to_kernel(connect->addr, 1668 - connect->addr_len, 1669 - &__io.address); 1670 - if (ret) 1671 - goto out; 1672 - io = &__io; 1673 - } 1674 - 1675 1528 file_flags = force_nonblock ? 
O_NONBLOCK : 0; 1676 1529 1677 - ret = __sys_connect_file(req->file, &io->address, 1678 - connect->addr_len, file_flags); 1530 + ret = __sys_connect_file(req->file, &io->addr, connect->addr_len, 1531 + file_flags); 1679 1532 if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED) 1680 1533 && force_nonblock) { 1681 1534 if (ret == -EINPROGRESS) { ··· 1679 1544 goto out; 1680 1545 connect->seen_econnaborted = true; 1681 1546 } 1682 - if (req_has_async_data(req)) 1683 - return -EAGAIN; 1684 - if (io_alloc_async_data(req)) { 1685 - ret = -ENOMEM; 1686 - goto out; 1687 - } 1688 - memcpy(req->async_data, &__io, sizeof(__io)); 1689 1547 return -EAGAIN; 1690 1548 } 1691 1549 if (connect->in_progress) { ··· 1696 1568 out: 1697 1569 if (ret < 0) 1698 1570 req_set_fail(req); 1571 + io_req_msg_cleanup(req, issue_flags); 1699 1572 io_req_set_res(req, ret, 0); 1700 1573 return IOU_OK; 1701 1574 } 1702 1575 1703 - void io_netmsg_cache_free(struct io_cache_entry *entry) 1576 + void io_netmsg_cache_free(const void *entry) 1704 1577 { 1705 - kfree(container_of(entry, struct io_async_msghdr, cache)); 1578 + struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; 1579 + 1580 + if (kmsg->free_iov) { 1581 + kasan_mempool_unpoison_object(kmsg->free_iov, 1582 + kmsg->free_iov_nr * sizeof(struct iovec)); 1583 + io_netmsg_iovec_free(kmsg); 1584 + } 1585 + kfree(kmsg); 1706 1586 } 1707 1587 #endif

+7 -22

io_uring/net.h

··· 3 3 #include <linux/net.h> 4 4 #include <linux/uio.h> 5 5 6 - #include "alloc_cache.h" 7 - 8 6 struct io_async_msghdr { 9 7 #if defined(CONFIG_NET) 10 - union { 11 - struct iovec fast_iov[UIO_FASTIOV]; 12 - struct { 13 - struct iovec fast_iov_one; 14 - __kernel_size_t controllen; 15 - int namelen; 16 - __kernel_size_t payloadlen; 17 - }; 18 - struct io_cache_entry cache; 19 - }; 8 + struct iovec fast_iov; 20 9 /* points to an allocated iov, if NULL we use fast_iov instead */ 21 10 struct iovec *free_iov; 11 + int free_iov_nr; 12 + int namelen; 13 + __kernel_size_t controllen; 14 + __kernel_size_t payloadlen; 22 15 struct sockaddr __user *uaddr; 23 16 struct msghdr msg; 24 17 struct sockaddr_storage addr; ··· 20 27 21 28 #if defined(CONFIG_NET) 22 29 23 - struct io_async_connect { 24 - struct sockaddr_storage address; 25 - }; 26 - 27 30 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 28 31 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags); 29 32 30 - int io_sendmsg_prep_async(struct io_kiocb *req); 31 33 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req); 32 34 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 33 35 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags); 34 36 35 37 int io_send(struct io_kiocb *req, unsigned int issue_flags); 36 - int io_send_prep_async(struct io_kiocb *req); 37 38 38 - int io_recvmsg_prep_async(struct io_kiocb *req); 39 39 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 40 40 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags); 41 41 int io_recv(struct io_kiocb *req, unsigned int issue_flags); ··· 41 55 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 42 56 int io_socket(struct io_kiocb *req, unsigned int issue_flags); 43 57 44 - int io_connect_prep_async(struct io_kiocb *req); 45 58 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 46 59 int io_connect(struct 
io_kiocb *req, unsigned int issue_flags); 47 60 ··· 49 64 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 50 65 void io_send_zc_cleanup(struct io_kiocb *req); 51 66 52 - void io_netmsg_cache_free(struct io_cache_entry *entry); 67 + void io_netmsg_cache_free(const void *entry); 53 68 #else 54 - static inline void io_netmsg_cache_free(struct io_cache_entry *entry) 69 + static inline void io_netmsg_cache_free(const void *entry) 55 70 { 56 71 } 57 72 #endif

+22 -4

io_uring/nop.c

··· 10 10 #include "io_uring.h" 11 11 #include "nop.h" 12 12 13 + struct io_nop { 14 + /* NOTE: kiocb has the file as the first member, so don't do it here */ 15 + struct file *file; 16 + int result; 17 + }; 18 + 13 19 int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 14 20 { 21 + unsigned int flags; 22 + struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); 23 + 24 + flags = READ_ONCE(sqe->nop_flags); 25 + if (flags & ~IORING_NOP_INJECT_RESULT) 26 + return -EINVAL; 27 + 28 + if (flags & IORING_NOP_INJECT_RESULT) 29 + nop->result = READ_ONCE(sqe->len); 30 + else 31 + nop->result = 0; 15 32 return 0; 16 33 } 17 34 18 - /* 19 - * IORING_OP_NOP just posts a completion event, nothing else. 20 - */ 21 35 int io_nop(struct io_kiocb *req, unsigned int issue_flags) 22 36 { 23 - io_req_set_res(req, 0, 0); 37 + struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); 38 + 39 + if (nop->result < 0) 40 + req_set_fail(req); 41 + io_req_set_res(req, nop->result, 0); 24 42 return IOU_OK; 25 43 }

+80 -34

io_uring/notif.c

··· 9 9 #include "notif.h" 10 10 #include "rsrc.h" 11 11 12 - static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts) 12 + static const struct ubuf_info_ops io_ubuf_ops; 13 + 14 + static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) 13 15 { 14 16 struct io_notif_data *nd = io_notif_to_data(notif); 15 - struct io_ring_ctx *ctx = notif->ctx; 16 17 17 - if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) 18 - notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; 18 + do { 19 + notif = cmd_to_io_kiocb(nd); 19 20 20 - if (nd->account_pages && ctx->user) { 21 - __io_unaccount_mem(ctx->user, nd->account_pages); 22 - nd->account_pages = 0; 23 - } 24 - io_req_task_complete(notif, ts); 21 + lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0); 22 + 23 + if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used)) 24 + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; 25 + 26 + if (nd->account_pages && notif->ctx->user) { 27 + __io_unaccount_mem(notif->ctx->user, nd->account_pages); 28 + nd->account_pages = 0; 29 + } 30 + 31 + nd = nd->next; 32 + io_req_task_complete(notif, ts); 33 + } while (nd); 25 34 } 26 35 27 - static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, 28 - bool success) 36 + void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, 37 + bool success) 29 38 { 30 39 struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); 31 40 struct io_kiocb *notif = cmd_to_io_kiocb(nd); 32 - 33 - if (refcount_dec_and_test(&uarg->refcnt)) 34 - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); 35 - } 36 - 37 - static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, 38 - bool success) 39 - { 40 - struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); 41 + unsigned tw_flags; 41 42 42 43 if (nd->zc_report) { 43 44 if (success && !nd->zc_used && skb) ··· 46 45 else if (!success && !nd->zc_copied) 47 46 WRITE_ONCE(nd->zc_copied, 
true); 48 47 } 49 - io_tx_ubuf_callback(skb, uarg, success); 50 - } 51 48 52 - void io_notif_set_extended(struct io_kiocb *notif) 53 - { 54 - struct io_notif_data *nd = io_notif_to_data(notif); 49 + if (!refcount_dec_and_test(&uarg->refcnt)) 50 + return; 55 51 56 - if (nd->uarg.callback != io_tx_ubuf_callback_ext) { 57 - nd->account_pages = 0; 58 - nd->zc_report = false; 59 - nd->zc_used = false; 60 - nd->zc_copied = false; 61 - nd->uarg.callback = io_tx_ubuf_callback_ext; 62 - notif->io_task_work.func = io_notif_complete_tw_ext; 52 + if (nd->head != nd) { 53 + io_tx_ubuf_complete(skb, &nd->head->uarg, success); 54 + return; 63 55 } 56 + 57 + tw_flags = nd->next ? 0 : IOU_F_TWQ_LAZY_WAKE; 58 + notif->io_task_work.func = io_notif_tw_complete; 59 + __io_req_task_work_add(notif, tw_flags); 64 60 } 61 + 62 + static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) 63 + { 64 + struct io_notif_data *nd, *prev_nd; 65 + struct io_kiocb *prev_notif, *notif; 66 + struct ubuf_info *prev_uarg = skb_zcopy(skb); 67 + 68 + nd = container_of(uarg, struct io_notif_data, uarg); 69 + notif = cmd_to_io_kiocb(nd); 70 + 71 + if (!prev_uarg) { 72 + net_zcopy_get(&nd->uarg); 73 + skb_zcopy_init(skb, &nd->uarg); 74 + return 0; 75 + } 76 + /* handle it separately as we can't link a notif to itself */ 77 + if (unlikely(prev_uarg == &nd->uarg)) 78 + return 0; 79 + /* we can't join two links together, just request a fresh skb */ 80 + if (unlikely(nd->head != nd || nd->next)) 81 + return -EEXIST; 82 + /* don't mix zc providers */ 83 + if (unlikely(prev_uarg->ops != &io_ubuf_ops)) 84 + return -EEXIST; 85 + 86 + prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); 87 + prev_notif = cmd_to_io_kiocb(nd); 88 + 89 + /* make sure all noifications can be finished in the same task_work */ 90 + if (unlikely(notif->ctx != prev_notif->ctx || 91 + notif->task != prev_notif->task)) 92 + return -EEXIST; 93 + 94 + nd->head = prev_nd->head; 95 + nd->next = prev_nd->next; 96 + prev_nd->next 
= nd; 97 + net_zcopy_get(&nd->head->uarg); 98 + return 0; 99 + } 100 + 101 + static const struct ubuf_info_ops io_ubuf_ops = { 102 + .complete = io_tx_ubuf_complete, 103 + .link_skb = io_link_skb, 104 + }; 65 105 66 106 struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) 67 107 __must_hold(&ctx->uring_lock) ··· 118 76 notif->task = current; 119 77 io_get_task_refs(1); 120 78 notif->rsrc_node = NULL; 121 - notif->io_task_work.func = io_req_task_complete; 122 79 123 80 nd = io_notif_to_data(notif); 81 + nd->zc_report = false; 82 + nd->account_pages = 0; 83 + nd->next = NULL; 84 + nd->head = nd; 85 + 124 86 nd->uarg.flags = IO_NOTIF_UBUF_FLAGS; 125 - nd->uarg.callback = io_tx_ubuf_callback; 87 + nd->uarg.ops = &io_ubuf_ops; 126 88 refcount_set(&nd->uarg.refcnt, 1); 127 89 return notif; 128 90 }

+8 -5

io_uring/notif.h

··· 13 13 struct io_notif_data { 14 14 struct file *file; 15 15 struct ubuf_info uarg; 16 - unsigned long account_pages; 16 + 17 + struct io_notif_data *next; 18 + struct io_notif_data *head; 19 + 20 + unsigned account_pages; 17 21 bool zc_report; 18 22 bool zc_used; 19 23 bool zc_copied; 20 24 }; 21 25 22 26 struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); 23 - void io_notif_set_extended(struct io_kiocb *notif); 27 + void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, 28 + bool success); 24 29 25 30 static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) 26 31 { ··· 37 32 { 38 33 struct io_notif_data *nd = io_notif_to_data(notif); 39 34 40 - /* drop slot's master ref */ 41 - if (refcount_dec_and_test(&nd->uarg.refcnt)) 42 - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); 35 + io_tx_ubuf_complete(NULL, &nd->uarg, true); 43 36 } 44 37 45 38 static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)

+27 -38

io_uring/opdef.c

··· 67 67 .iopoll = 1, 68 68 .iopoll_queue = 1, 69 69 .vectored = 1, 70 - .prep = io_prep_rwv, 70 + .async_size = sizeof(struct io_async_rw), 71 + .prep = io_prep_readv, 71 72 .issue = io_read, 72 73 }, 73 74 [IORING_OP_WRITEV] = { ··· 82 81 .iopoll = 1, 83 82 .iopoll_queue = 1, 84 83 .vectored = 1, 85 - .prep = io_prep_rwv, 84 + .async_size = sizeof(struct io_async_rw), 85 + .prep = io_prep_writev, 86 86 .issue = io_write, 87 87 }, 88 88 [IORING_OP_FSYNC] = { ··· 101 99 .ioprio = 1, 102 100 .iopoll = 1, 103 101 .iopoll_queue = 1, 104 - .prep = io_prep_rw_fixed, 102 + .async_size = sizeof(struct io_async_rw), 103 + .prep = io_prep_read_fixed, 105 104 .issue = io_read, 106 105 }, 107 106 [IORING_OP_WRITE_FIXED] = { ··· 115 112 .ioprio = 1, 116 113 .iopoll = 1, 117 114 .iopoll_queue = 1, 118 - .prep = io_prep_rw_fixed, 115 + .async_size = sizeof(struct io_async_rw), 116 + .prep = io_prep_write_fixed, 119 117 .issue = io_write, 120 118 }, 121 119 [IORING_OP_POLL_ADD] = { ··· 142 138 .unbound_nonreg_file = 1, 143 139 .pollout = 1, 144 140 .ioprio = 1, 145 - .manual_alloc = 1, 146 141 #if defined(CONFIG_NET) 142 + .async_size = sizeof(struct io_async_msghdr), 147 143 .prep = io_sendmsg_prep, 148 144 .issue = io_sendmsg, 149 145 #else ··· 156 152 .pollin = 1, 157 153 .buffer_select = 1, 158 154 .ioprio = 1, 159 - .manual_alloc = 1, 160 155 #if defined(CONFIG_NET) 156 + .async_size = sizeof(struct io_async_msghdr), 161 157 .prep = io_recvmsg_prep, 162 158 .issue = io_recvmsg, 163 159 #else ··· 166 162 }, 167 163 [IORING_OP_TIMEOUT] = { 168 164 .audit_skip = 1, 165 + .async_size = sizeof(struct io_timeout_data), 169 166 .prep = io_timeout_prep, 170 167 .issue = io_timeout, 171 168 }, ··· 196 191 }, 197 192 [IORING_OP_LINK_TIMEOUT] = { 198 193 .audit_skip = 1, 194 + .async_size = sizeof(struct io_timeout_data), 199 195 .prep = io_link_timeout_prep, 200 196 .issue = io_no_issue, 201 197 }, ··· 205 199 .unbound_nonreg_file = 1, 206 200 .pollout = 1, 207 201 #if 
defined(CONFIG_NET) 202 + .async_size = sizeof(struct io_async_msghdr), 208 203 .prep = io_connect_prep, 209 204 .issue = io_connect, 210 205 #else ··· 246 239 .ioprio = 1, 247 240 .iopoll = 1, 248 241 .iopoll_queue = 1, 249 - .prep = io_prep_rw, 242 + .async_size = sizeof(struct io_async_rw), 243 + .prep = io_prep_read, 250 244 .issue = io_read, 251 245 }, 252 246 [IORING_OP_WRITE] = { ··· 260 252 .ioprio = 1, 261 253 .iopoll = 1, 262 254 .iopoll_queue = 1, 263 - .prep = io_prep_rw, 255 + .async_size = sizeof(struct io_async_rw), 256 + .prep = io_prep_write, 264 257 .issue = io_write, 265 258 }, 266 259 [IORING_OP_FADVISE] = { ··· 281 272 .pollout = 1, 282 273 .audit_skip = 1, 283 274 .ioprio = 1, 284 - .manual_alloc = 1, 275 + .buffer_select = 1, 285 276 #if defined(CONFIG_NET) 277 + .async_size = sizeof(struct io_async_msghdr), 286 278 .prep = io_sendmsg_prep, 287 279 .issue = io_send, 288 280 #else ··· 298 288 .audit_skip = 1, 299 289 .ioprio = 1, 300 290 #if defined(CONFIG_NET) 291 + .async_size = sizeof(struct io_async_msghdr), 301 292 .prep = io_recvmsg_prep, 302 293 .issue = io_recv, 303 294 #else ··· 414 403 .plug = 1, 415 404 .iopoll = 1, 416 405 .iopoll_queue = 1, 406 + .async_size = 2 * sizeof(struct io_uring_sqe), 417 407 .prep = io_uring_cmd_prep, 418 408 .issue = io_uring_cmd, 419 409 }, ··· 424 412 .pollout = 1, 425 413 .audit_skip = 1, 426 414 .ioprio = 1, 427 - .manual_alloc = 1, 428 415 #if defined(CONFIG_NET) 416 + .async_size = sizeof(struct io_async_msghdr), 429 417 .prep = io_send_zc_prep, 430 418 .issue = io_send_zc, 431 419 #else ··· 437 425 .unbound_nonreg_file = 1, 438 426 .pollout = 1, 439 427 .ioprio = 1, 440 - .manual_alloc = 1, 441 428 #if defined(CONFIG_NET) 429 + .async_size = sizeof(struct io_async_msghdr), 442 430 .prep = io_send_zc_prep, 443 431 .issue = io_sendmsg_zc, 444 432 #else ··· 451 439 .pollin = 1, 452 440 .buffer_select = 1, 453 441 .audit_skip = 1, 442 + .async_size = sizeof(struct io_async_rw), 454 443 .prep = 
io_read_mshot_prep, 455 444 .issue = io_read_mshot, 456 445 }, 457 446 [IORING_OP_WAITID] = { 447 + .async_size = sizeof(struct io_waitid_async), 458 448 .prep = io_waitid_prep, 459 449 .issue = io_waitid, 460 450 }, ··· 502 488 .name = "NOP", 503 489 }, 504 490 [IORING_OP_READV] = { 505 - .async_size = sizeof(struct io_async_rw), 506 491 .name = "READV", 507 - .prep_async = io_readv_prep_async, 508 492 .cleanup = io_readv_writev_cleanup, 509 493 .fail = io_rw_fail, 510 494 }, 511 495 [IORING_OP_WRITEV] = { 512 - .async_size = sizeof(struct io_async_rw), 513 496 .name = "WRITEV", 514 - .prep_async = io_writev_prep_async, 515 497 .cleanup = io_readv_writev_cleanup, 516 498 .fail = io_rw_fail, 517 499 }, ··· 515 505 .name = "FSYNC", 516 506 }, 517 507 [IORING_OP_READ_FIXED] = { 518 - .async_size = sizeof(struct io_async_rw), 519 508 .name = "READ_FIXED", 520 509 .fail = io_rw_fail, 521 510 }, 522 511 [IORING_OP_WRITE_FIXED] = { 523 - .async_size = sizeof(struct io_async_rw), 524 512 .name = "WRITE_FIXED", 525 513 .fail = io_rw_fail, 526 514 }, ··· 534 526 [IORING_OP_SENDMSG] = { 535 527 .name = "SENDMSG", 536 528 #if defined(CONFIG_NET) 537 - .async_size = sizeof(struct io_async_msghdr), 538 - .prep_async = io_sendmsg_prep_async, 539 529 .cleanup = io_sendmsg_recvmsg_cleanup, 540 530 .fail = io_sendrecv_fail, 541 531 #endif ··· 541 535 [IORING_OP_RECVMSG] = { 542 536 .name = "RECVMSG", 543 537 #if defined(CONFIG_NET) 544 - .async_size = sizeof(struct io_async_msghdr), 545 - .prep_async = io_recvmsg_prep_async, 546 538 .cleanup = io_sendmsg_recvmsg_cleanup, 547 539 .fail = io_sendrecv_fail, 548 540 #endif 549 541 }, 550 542 [IORING_OP_TIMEOUT] = { 551 - .async_size = sizeof(struct io_timeout_data), 552 543 .name = "TIMEOUT", 553 544 }, 554 545 [IORING_OP_TIMEOUT_REMOVE] = { ··· 558 555 .name = "ASYNC_CANCEL", 559 556 }, 560 557 [IORING_OP_LINK_TIMEOUT] = { 561 - .async_size = sizeof(struct io_timeout_data), 562 558 .name = "LINK_TIMEOUT", 563 559 }, 564 560 
[IORING_OP_CONNECT] = { 565 561 .name = "CONNECT", 566 - #if defined(CONFIG_NET) 567 - .async_size = sizeof(struct io_async_connect), 568 - .prep_async = io_connect_prep_async, 569 - #endif 570 562 }, 571 563 [IORING_OP_FALLOCATE] = { 572 564 .name = "FALLOCATE", ··· 581 583 .cleanup = io_statx_cleanup, 582 584 }, 583 585 [IORING_OP_READ] = { 584 - .async_size = sizeof(struct io_async_rw), 585 586 .name = "READ", 586 587 .fail = io_rw_fail, 587 588 }, 588 589 [IORING_OP_WRITE] = { 589 - .async_size = sizeof(struct io_async_rw), 590 590 .name = "WRITE", 591 591 .fail = io_rw_fail, 592 592 }, ··· 597 601 [IORING_OP_SEND] = { 598 602 .name = "SEND", 599 603 #if defined(CONFIG_NET) 600 - .async_size = sizeof(struct io_async_msghdr), 604 + .cleanup = io_sendmsg_recvmsg_cleanup, 601 605 .fail = io_sendrecv_fail, 602 - .prep_async = io_send_prep_async, 603 606 #endif 604 607 }, 605 608 [IORING_OP_RECV] = { 606 609 .name = "RECV", 607 610 #if defined(CONFIG_NET) 611 + .cleanup = io_sendmsg_recvmsg_cleanup, 608 612 .fail = io_sendrecv_fail, 609 613 #endif 610 614 }, ··· 675 679 }, 676 680 [IORING_OP_URING_CMD] = { 677 681 .name = "URING_CMD", 678 - .async_size = 2 * sizeof(struct io_uring_sqe), 679 - .prep_async = io_uring_cmd_prep_async, 680 682 }, 681 683 [IORING_OP_SEND_ZC] = { 682 684 .name = "SEND_ZC", 683 685 #if defined(CONFIG_NET) 684 - .async_size = sizeof(struct io_async_msghdr), 685 - .prep_async = io_send_prep_async, 686 686 .cleanup = io_send_zc_cleanup, 687 687 .fail = io_sendrecv_fail, 688 688 #endif ··· 686 694 [IORING_OP_SENDMSG_ZC] = { 687 695 .name = "SENDMSG_ZC", 688 696 #if defined(CONFIG_NET) 689 - .async_size = sizeof(struct io_async_msghdr), 690 - .prep_async = io_sendmsg_prep_async, 691 697 .cleanup = io_send_zc_cleanup, 692 698 .fail = io_sendrecv_fail, 693 699 #endif ··· 695 705 }, 696 706 [IORING_OP_WAITID] = { 697 707 .name = "WAITID", 698 - .async_size = sizeof(struct io_waitid_async), 699 708 }, 700 709 [IORING_OP_FUTEX_WAIT] = { 701 710 .name 
= "FUTEX_WAIT",

+3 -6

io_uring/opdef.h

··· 27 27 unsigned iopoll : 1; 28 28 /* have to be put into the iopoll list */ 29 29 unsigned iopoll_queue : 1; 30 - /* opcode specific path will handle ->async_data allocation if needed */ 31 - unsigned manual_alloc : 1; 32 30 /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ 33 31 unsigned vectored : 1; 32 + 33 + /* size of async data needed, if any */ 34 + unsigned short async_size; 34 35 35 36 int (*issue)(struct io_kiocb *, unsigned int); 36 37 int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); 37 38 }; 38 39 39 40 struct io_cold_def { 40 - /* size of async data needed, if any */ 41 - unsigned short async_size; 42 - 43 41 const char *name; 44 42 45 - int (*prep_async)(struct io_kiocb *); 46 43 void (*cleanup)(struct io_kiocb *); 47 44 void (*fail)(struct io_kiocb *); 48 45 };

+4 -11

io_uring/poll.c

··· 14 14 #include <uapi/linux/io_uring.h> 15 15 16 16 #include "io_uring.h" 17 + #include "alloc_cache.h" 17 18 #include "refs.h" 18 19 #include "napi.h" 19 20 #include "opdef.h" ··· 323 322 __poll_t mask = mangle_poll(req->cqe.res & 324 323 req->apoll_events); 325 324 326 - if (!io_fill_cqe_req_aux(req, ts->locked, mask, 327 - IORING_CQE_F_MORE)) { 325 + if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) { 328 326 io_req_set_res(req, mask, 0); 329 327 return IOU_POLL_REMOVE_POLL_USE_RES; 330 328 } ··· 687 687 unsigned issue_flags) 688 688 { 689 689 struct io_ring_ctx *ctx = req->ctx; 690 - struct io_cache_entry *entry; 691 690 struct async_poll *apoll; 692 691 693 692 if (req->flags & REQ_F_POLLED) { 694 693 apoll = req->apoll; 695 694 kfree(apoll->double_poll); 696 695 } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { 697 - entry = io_alloc_cache_get(&ctx->apoll_cache); 698 - if (entry == NULL) 696 + apoll = io_alloc_cache_get(&ctx->apoll_cache); 697 + if (!apoll) 699 698 goto alloc_apoll; 700 - apoll = container_of(entry, struct async_poll, cache); 701 699 apoll->poll.retries = APOLL_MAX_RETRY; 702 700 } else { 703 701 alloc_apoll: ··· 1053 1055 /* complete update request, we're done with it */ 1054 1056 io_req_set_res(req, ret, 0); 1055 1057 return IOU_OK; 1056 - } 1057 - 1058 - void io_apoll_cache_free(struct io_cache_entry *entry) 1059 - { 1060 - kfree(container_of(entry, struct async_poll, cache)); 1061 1058 }

+2 -7

io_uring/poll.h

··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 - #include "alloc_cache.h" 3 + #define IO_POLL_ALLOC_CACHE_MAX 32 4 4 5 5 enum { 6 6 IO_APOLL_OK, ··· 17 17 }; 18 18 19 19 struct async_poll { 20 - union { 21 - struct io_poll poll; 22 - struct io_cache_entry cache; 23 - }; 20 + struct io_poll poll; 24 21 struct io_poll *double_poll; 25 22 }; 26 23 ··· 42 45 int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); 43 46 bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, 44 47 bool cancel_all); 45 - 46 - void io_apoll_cache_free(struct io_cache_entry *entry); 47 48 48 49 void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts);

+7

io_uring/refs.h

··· 33 33 atomic_inc(&req->refs); 34 34 } 35 35 36 + static inline void req_ref_put(struct io_kiocb *req) 37 + { 38 + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 39 + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 40 + atomic_dec(&req->refs); 41 + } 42 + 36 43 static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) 37 44 { 38 45 if (!(req->flags & REQ_F_REFCOUNT)) {

+1 -2

io_uring/register.c

··· 368 368 369 369 /* now propagate the restriction to all registered users */ 370 370 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 371 - struct io_uring_task *tctx = node->task->io_uring; 372 - 371 + tctx = node->task->io_uring; 373 372 if (WARN_ON_ONCE(!tctx->io_wq)) 374 373 continue; 375 374

+5 -42

io_uring/rsrc.c

··· 13 13 #include <uapi/linux/io_uring.h> 14 14 15 15 #include "io_uring.h" 16 + #include "alloc_cache.h" 16 17 #include "openclose.h" 17 18 #include "rsrc.h" 19 + #include "memmap.h" 18 20 19 21 struct io_rsrc_update { 20 22 struct file *file; ··· 171 169 172 170 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) 173 171 { 174 - if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) 172 + if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node)) 175 173 kfree(node); 176 174 } 177 175 ··· 199 197 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) 200 198 { 201 199 struct io_rsrc_node *ref_node; 202 - struct io_cache_entry *entry; 203 200 204 - entry = io_alloc_cache_get(&ctx->rsrc_node_cache); 205 - if (entry) { 206 - ref_node = container_of(entry, struct io_rsrc_node, cache); 207 - } else { 201 + ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache); 202 + if (!ref_node) { 208 203 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); 209 204 if (!ref_node) 210 205 return NULL; ··· 869 870 if (ret) 870 871 imu->acct_pages = 0; 871 872 return ret; 872 - } 873 - 874 - struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) 875 - { 876 - unsigned long start, end, nr_pages; 877 - struct page **pages = NULL; 878 - int ret; 879 - 880 - end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 881 - start = ubuf >> PAGE_SHIFT; 882 - nr_pages = end - start; 883 - WARN_ON(!nr_pages); 884 - 885 - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); 886 - if (!pages) 887 - return ERR_PTR(-ENOMEM); 888 - 889 - mmap_read_lock(current->mm); 890 - ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); 891 - mmap_read_unlock(current->mm); 892 - 893 - /* success, mapped all pages */ 894 - if (ret == nr_pages) { 895 - *npages = nr_pages; 896 - return pages; 897 - } 898 - 899 - /* partial map, or didn't map anything */ 900 - if (ret >= 0) { 901 - /* if we did partial map, release any pages we 
did get */ 902 - if (ret) 903 - unpin_user_pages(pages, ret); 904 - ret = -EFAULT; 905 - } 906 - kvfree(pages); 907 - return ERR_PTR(ret); 908 873 } 909 874 910 875 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,

+1 -12

io_uring/rsrc.h

··· 2 2 #ifndef IOU_RSRC_H 3 3 #define IOU_RSRC_H 4 4 5 - #include "alloc_cache.h" 6 - 7 5 #define IO_NODE_ALLOC_CACHE_MAX 32 8 6 9 7 #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) ··· 34 36 }; 35 37 36 38 struct io_rsrc_node { 37 - union { 38 - struct io_cache_entry cache; 39 - struct io_ring_ctx *ctx; 40 - }; 39 + struct io_ring_ctx *ctx; 41 40 int refs; 42 41 bool empty; 43 42 u16 type; ··· 81 86 82 87 if (node && !--node->refs) 83 88 io_rsrc_node_ref_zero(node); 84 - } 85 - 86 - static inline void io_req_put_rsrc_locked(struct io_kiocb *req, 87 - struct io_ring_ctx *ctx) 88 - { 89 - io_put_rsrc_node(ctx, req->rsrc_node); 90 89 } 91 90 92 91 static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,

+292 -301

io_uring/rw.c

··· 18 18 #include "io_uring.h" 19 19 #include "opdef.h" 20 20 #include "kbuf.h" 21 + #include "alloc_cache.h" 21 22 #include "rsrc.h" 22 23 #include "poll.h" 23 24 #include "rw.h" ··· 76 75 return 0; 77 76 } 78 77 79 - int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) 78 + static int __io_import_iovec(int ddir, struct io_kiocb *req, 79 + struct io_async_rw *io, 80 + unsigned int issue_flags) 81 + { 82 + const struct io_issue_def *def = &io_issue_defs[req->opcode]; 83 + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 84 + struct iovec *iov; 85 + void __user *buf; 86 + int nr_segs, ret; 87 + size_t sqe_len; 88 + 89 + buf = u64_to_user_ptr(rw->addr); 90 + sqe_len = rw->len; 91 + 92 + if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { 93 + if (io_do_buffer_select(req)) { 94 + buf = io_buffer_select(req, &sqe_len, issue_flags); 95 + if (!buf) 96 + return -ENOBUFS; 97 + rw->addr = (unsigned long) buf; 98 + rw->len = sqe_len; 99 + } 100 + 101 + return import_ubuf(ddir, buf, sqe_len, &io->iter); 102 + } 103 + 104 + if (io->free_iovec) { 105 + nr_segs = io->free_iov_nr; 106 + iov = io->free_iovec; 107 + } else { 108 + iov = &io->fast_iov; 109 + nr_segs = 1; 110 + } 111 + ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, 112 + req->ctx->compat); 113 + if (unlikely(ret < 0)) 114 + return ret; 115 + if (iov) { 116 + req->flags |= REQ_F_NEED_CLEANUP; 117 + io->free_iov_nr = io->iter.nr_segs; 118 + kfree(io->free_iovec); 119 + io->free_iovec = iov; 120 + } 121 + return 0; 122 + } 123 + 124 + static inline int io_import_iovec(int rw, struct io_kiocb *req, 125 + struct io_async_rw *io, 126 + unsigned int issue_flags) 127 + { 128 + int ret; 129 + 130 + ret = __io_import_iovec(rw, req, io, issue_flags); 131 + if (unlikely(ret < 0)) 132 + return ret; 133 + 134 + iov_iter_save_state(&io->iter, &io->iter_state); 135 + return 0; 136 + } 137 + 138 + static void io_rw_iovec_free(struct io_async_rw *rw) 139 + { 140 + if (rw->free_iovec) { 
141 + kfree(rw->free_iovec); 142 + rw->free_iov_nr = 0; 143 + rw->free_iovec = NULL; 144 + } 145 + } 146 + 147 + static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) 148 + { 149 + struct io_async_rw *rw = req->async_data; 150 + struct iovec *iov; 151 + 152 + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { 153 + io_rw_iovec_free(rw); 154 + return; 155 + } 156 + iov = rw->free_iovec; 157 + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { 158 + if (iov) 159 + kasan_mempool_poison_object(iov); 160 + req->async_data = NULL; 161 + req->flags &= ~REQ_F_ASYNC_DATA; 162 + } 163 + } 164 + 165 + static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) 166 + { 167 + /* 168 + * Disable quick recycling for anything that's gone through io-wq. 169 + * In theory, this should be fine to cleanup. However, some read or 170 + * write iter handling touches the iovec AFTER having called into the 171 + * handler, eg to reexpand or revert. This means we can have: 172 + * 173 + * task io-wq 174 + * issue 175 + * punt to io-wq 176 + * issue 177 + * blkdev_write_iter() 178 + * ->ki_complete() 179 + * io_complete_rw() 180 + * queue tw complete 181 + * run tw 182 + * req_rw_cleanup 183 + * iov_iter_count() <- look at iov_iter again 184 + * 185 + * which can lead to a UAF. This is only possible for io-wq offload 186 + * as the cleanup can run in parallel. As io-wq is not the fast path, 187 + * just leave cleanup to the end. 188 + * 189 + * This is really a bug in the core code that does this, any issue 190 + * path should assume that a successful (or -EIOCBQUEUED) return can 191 + * mean that the underlying data can be gone at any time. But that 192 + * should be fixed separately, and then this check could be killed. 
193 + */ 194 + if (!(req->flags & REQ_F_REFCOUNT)) { 195 + req->flags &= ~REQ_F_NEED_CLEANUP; 196 + io_rw_recycle(req, issue_flags); 197 + } 198 + } 199 + 200 + static int io_rw_alloc_async(struct io_kiocb *req) 201 + { 202 + struct io_ring_ctx *ctx = req->ctx; 203 + struct io_async_rw *rw; 204 + 205 + rw = io_alloc_cache_get(&ctx->rw_cache); 206 + if (rw) { 207 + if (rw->free_iovec) { 208 + kasan_mempool_unpoison_object(rw->free_iovec, 209 + rw->free_iov_nr * sizeof(struct iovec)); 210 + req->flags |= REQ_F_NEED_CLEANUP; 211 + } 212 + req->flags |= REQ_F_ASYNC_DATA; 213 + req->async_data = rw; 214 + goto done; 215 + } 216 + 217 + if (!io_alloc_async_data(req)) { 218 + rw = req->async_data; 219 + rw->free_iovec = NULL; 220 + rw->free_iov_nr = 0; 221 + done: 222 + rw->bytes_done = 0; 223 + return 0; 224 + } 225 + 226 + return -ENOMEM; 227 + } 228 + 229 + static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) 230 + { 231 + struct io_async_rw *rw; 232 + int ret; 233 + 234 + if (io_rw_alloc_async(req)) 235 + return -ENOMEM; 236 + 237 + if (!do_import || io_do_buffer_select(req)) 238 + return 0; 239 + 240 + rw = req->async_data; 241 + ret = io_import_iovec(ddir, req, rw, 0); 242 + if (unlikely(ret < 0)) 243 + return ret; 244 + 245 + iov_iter_save_state(&rw->iter, &rw->iter_state); 246 + return 0; 247 + } 248 + 249 + static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, 250 + int ddir, bool do_import) 80 251 { 81 252 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 82 253 unsigned ioprio; ··· 273 100 rw->addr = READ_ONCE(sqe->addr); 274 101 rw->len = READ_ONCE(sqe->len); 275 102 rw->flags = READ_ONCE(sqe->rw_flags); 276 - return 0; 103 + return io_prep_rw_setup(req, ddir, do_import); 277 104 } 278 105 279 - int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe) 106 + int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) 280 107 { 108 + return io_prep_rw(req, sqe, ITER_DEST, true); 109 + } 
110 + 111 + int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) 112 + { 113 + return io_prep_rw(req, sqe, ITER_SOURCE, true); 114 + } 115 + 116 + static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, 117 + int ddir) 118 + { 119 + const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); 281 120 int ret; 282 121 283 - ret = io_prep_rw(req, sqe); 122 + ret = io_prep_rw(req, sqe, ddir, do_import); 284 123 if (unlikely(ret)) 285 124 return ret; 125 + if (do_import) 126 + return 0; 286 127 287 128 /* 288 129 * Have to do this validation here, as this is in io_read() rw->len 289 130 * might have changed due to buffer selection 290 131 */ 291 - if (req->flags & REQ_F_BUFFER_SELECT) 292 - return io_iov_buffer_select_prep(req); 293 - 294 - return 0; 132 + return io_iov_buffer_select_prep(req); 295 133 } 296 134 297 - int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) 135 + int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe) 298 136 { 137 + return io_prep_rwv(req, sqe, ITER_DEST); 138 + } 139 + 140 + int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe) 141 + { 142 + return io_prep_rwv(req, sqe, ITER_SOURCE); 143 + } 144 + 145 + static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, 146 + int ddir) 147 + { 148 + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 299 149 struct io_ring_ctx *ctx = req->ctx; 150 + struct io_async_rw *io; 300 151 u16 index; 301 152 int ret; 302 153 303 - ret = io_prep_rw(req, sqe); 154 + ret = io_prep_rw(req, sqe, ddir, false); 304 155 if (unlikely(ret)) 305 156 return ret; 306 157 ··· 333 136 index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); 334 137 req->imu = ctx->user_bufs[index]; 335 138 io_req_set_rsrc_node(req, ctx, 0); 336 - return 0; 139 + 140 + io = req->async_data; 141 + ret = io_import_fixed(ddir, &io->iter, req->imu, rw->addr, rw->len); 142 + iov_iter_save_state(&io->iter, 
&io->iter_state); 143 + return ret; 144 + } 145 + 146 + int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) 147 + { 148 + return io_prep_rw_fixed(req, sqe, ITER_DEST); 149 + } 150 + 151 + int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) 152 + { 153 + return io_prep_rw_fixed(req, sqe, ITER_SOURCE); 337 154 } 338 155 339 156 /* ··· 363 152 if (!(req->flags & REQ_F_BUFFER_SELECT)) 364 153 return -EINVAL; 365 154 366 - ret = io_prep_rw(req, sqe); 155 + ret = io_prep_rw(req, sqe, ITER_DEST, false); 367 156 if (unlikely(ret)) 368 157 return ret; 369 158 ··· 376 165 377 166 void io_readv_writev_cleanup(struct io_kiocb *req) 378 167 { 379 - struct io_async_rw *io = req->async_data; 380 - 381 - kfree(io->free_iovec); 168 + io_rw_iovec_free(req->async_data); 382 169 } 383 170 384 171 static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) ··· 396 187 return NULL; 397 188 } 398 189 399 - static void io_req_task_queue_reissue(struct io_kiocb *req) 400 - { 401 - req->io_task_work.func = io_queue_iowq; 402 - io_req_task_work_add(req); 403 - } 404 - 405 190 #ifdef CONFIG_BLOCK 406 - static bool io_resubmit_prep(struct io_kiocb *req) 191 + static void io_resubmit_prep(struct io_kiocb *req) 407 192 { 408 193 struct io_async_rw *io = req->async_data; 409 194 410 - if (!req_has_async_data(req)) 411 - return !io_req_prep_async(req); 412 - iov_iter_restore(&io->s.iter, &io->s.iter_state); 413 - return true; 195 + iov_iter_restore(&io->iter, &io->iter_state); 414 196 } 415 197 416 198 static bool io_rw_should_reissue(struct io_kiocb *req) ··· 430 230 return true; 431 231 } 432 232 #else 433 - static bool io_resubmit_prep(struct io_kiocb *req) 233 + static void io_resubmit_prep(struct io_kiocb *req) 434 234 { 435 - return false; 436 235 } 437 236 static bool io_rw_should_reissue(struct io_kiocb *req) 438 237 { ··· 510 311 511 312 io_req_io_end(req); 512 313 513 - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { 
514 - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; 314 + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) 315 + req->cqe.flags |= io_put_kbuf(req, 0); 515 316 516 - req->cqe.flags |= io_put_kbuf(req, issue_flags); 517 - } 317 + io_req_rw_cleanup(req, 0); 518 318 io_req_task_complete(req, ts); 519 319 } 520 320 ··· 594 396 io_req_io_end(req); 595 397 io_req_set_res(req, final_ret, 596 398 io_put_kbuf(req, issue_flags)); 399 + io_req_rw_cleanup(req, issue_flags); 597 400 return IOU_OK; 598 401 } 599 402 } else { ··· 603 404 604 405 if (req->flags & REQ_F_REISSUE) { 605 406 req->flags &= ~REQ_F_REISSUE; 606 - if (io_resubmit_prep(req)) 607 - io_req_task_queue_reissue(req); 608 - else 609 - io_req_task_queue_fail(req, final_ret); 407 + io_resubmit_prep(req); 408 + return -EAGAIN; 610 409 } 611 410 return IOU_ISSUE_SKIP_COMPLETE; 612 - } 613 - 614 - static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, 615 - struct io_rw_state *s, 616 - unsigned int issue_flags) 617 - { 618 - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 619 - struct iov_iter *iter = &s->iter; 620 - u8 opcode = req->opcode; 621 - struct iovec *iovec; 622 - void __user *buf; 623 - size_t sqe_len; 624 - ssize_t ret; 625 - 626 - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { 627 - ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len); 628 - if (ret) 629 - return ERR_PTR(ret); 630 - return NULL; 631 - } 632 - 633 - buf = u64_to_user_ptr(rw->addr); 634 - sqe_len = rw->len; 635 - 636 - if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) { 637 - if (io_do_buffer_select(req)) { 638 - buf = io_buffer_select(req, &sqe_len, issue_flags); 639 - if (!buf) 640 - return ERR_PTR(-ENOBUFS); 641 - rw->addr = (unsigned long) buf; 642 - rw->len = sqe_len; 643 - } 644 - 645 - ret = import_ubuf(ddir, buf, sqe_len, iter); 646 - if (ret) 647 - return ERR_PTR(ret); 648 - return NULL; 649 - } 650 - 651 - iovec = 
s->fast_iov; 652 - ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, 653 - req->ctx->compat); 654 - if (unlikely(ret < 0)) 655 - return ERR_PTR(ret); 656 - return iovec; 657 - } 658 - 659 - static inline int io_import_iovec(int rw, struct io_kiocb *req, 660 - struct iovec **iovec, struct io_rw_state *s, 661 - unsigned int issue_flags) 662 - { 663 - *iovec = __io_import_iovec(rw, req, s, issue_flags); 664 - if (IS_ERR(*iovec)) 665 - return PTR_ERR(*iovec); 666 - 667 - iov_iter_save_state(&s->iter, &s->iter_state); 668 - return 0; 669 411 } 670 412 671 413 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) ··· 678 538 } 679 539 680 540 return ret; 681 - } 682 - 683 - static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, 684 - const struct iovec *fast_iov, struct iov_iter *iter) 685 - { 686 - struct io_async_rw *io = req->async_data; 687 - 688 - memcpy(&io->s.iter, iter, sizeof(*iter)); 689 - io->free_iovec = iovec; 690 - io->bytes_done = 0; 691 - /* can only be fixed buffers, no need to do anything */ 692 - if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter)) 693 - return; 694 - if (!iovec) { 695 - unsigned iov_off = 0; 696 - 697 - io->s.iter.__iov = io->s.fast_iov; 698 - if (iter->__iov != fast_iov) { 699 - iov_off = iter_iov(iter) - fast_iov; 700 - io->s.iter.__iov += iov_off; 701 - } 702 - if (io->s.fast_iov != fast_iov) 703 - memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, 704 - sizeof(struct iovec) * iter->nr_segs); 705 - } else { 706 - req->flags |= REQ_F_NEED_CLEANUP; 707 - } 708 - } 709 - 710 - static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, 711 - struct io_rw_state *s, bool force) 712 - { 713 - if (!force && !io_cold_defs[req->opcode].prep_async) 714 - return 0; 715 - /* opcode type doesn't need async data */ 716 - if (!io_cold_defs[req->opcode].async_size) 717 - return 0; 718 - if (!req_has_async_data(req)) { 719 - struct io_async_rw *iorw; 720 - 721 - if (io_alloc_async_data(req)) 
{ 722 - kfree(iovec); 723 - return -ENOMEM; 724 - } 725 - 726 - io_req_map_rw(req, iovec, s->fast_iov, &s->iter); 727 - iorw = req->async_data; 728 - /* we've copied and mapped the iter, ensure state is saved */ 729 - iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); 730 - } 731 - return 0; 732 - } 733 - 734 - static inline int io_rw_prep_async(struct io_kiocb *req, int rw) 735 - { 736 - struct io_async_rw *iorw = req->async_data; 737 - struct iovec *iov; 738 - int ret; 739 - 740 - iorw->bytes_done = 0; 741 - iorw->free_iovec = NULL; 742 - 743 - /* submission path, ->uring_lock should already be taken */ 744 - ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); 745 - if (unlikely(ret < 0)) 746 - return ret; 747 - 748 - if (iov) { 749 - iorw->free_iovec = iov; 750 - req->flags |= REQ_F_NEED_CLEANUP; 751 - } 752 - 753 - return 0; 754 - } 755 - 756 - int io_readv_prep_async(struct io_kiocb *req) 757 - { 758 - return io_rw_prep_async(req, ITER_DEST); 759 - } 760 - 761 - int io_writev_prep_async(struct io_kiocb *req) 762 - { 763 - return io_rw_prep_async(req, ITER_SOURCE); 764 541 } 765 542 766 543 /* ··· 820 763 821 764 static int __io_read(struct io_kiocb *req, unsigned int issue_flags) 822 765 { 823 - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 824 - struct io_rw_state __s, *s = &__s; 825 - struct iovec *iovec; 826 - struct kiocb *kiocb = &rw->kiocb; 827 766 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 828 - struct io_async_rw *io; 829 - ssize_t ret, ret2; 767 + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 768 + struct io_async_rw *io = req->async_data; 769 + struct kiocb *kiocb = &rw->kiocb; 770 + ssize_t ret; 830 771 loff_t *ppos; 831 772 832 - if (!req_has_async_data(req)) { 833 - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); 773 + if (io_do_buffer_select(req)) { 774 + ret = io_import_iovec(ITER_DEST, req, io, issue_flags); 834 775 if (unlikely(ret < 0)) 835 776 return ret; 836 - } else { 837 - io = 
req->async_data; 838 - s = &io->s; 839 - 840 - /* 841 - * Safe and required to re-import if we're using provided 842 - * buffers, as we dropped the selected one before retry. 843 - */ 844 - if (io_do_buffer_select(req)) { 845 - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); 846 - if (unlikely(ret < 0)) 847 - return ret; 848 - } 849 - 850 - /* 851 - * We come here from an earlier attempt, restore our state to 852 - * match in case it doesn't. It's cheap enough that we don't 853 - * need to make this conditional. 854 - */ 855 - iov_iter_restore(&s->iter, &s->iter_state); 856 - iovec = NULL; 857 777 } 778 + 858 779 ret = io_rw_init_file(req, FMODE_READ); 859 - if (unlikely(ret)) { 860 - kfree(iovec); 780 + if (unlikely(ret)) 861 781 return ret; 862 - } 863 - req->cqe.res = iov_iter_count(&s->iter); 782 + req->cqe.res = iov_iter_count(&io->iter); 864 783 865 784 if (force_nonblock) { 866 785 /* If the file doesn't support async, just async punt */ 867 - if (unlikely(!io_file_supports_nowait(req))) { 868 - ret = io_setup_async_rw(req, iovec, s, true); 869 - return ret ?: -EAGAIN; 870 - } 786 + if (unlikely(!io_file_supports_nowait(req))) 787 + return -EAGAIN; 871 788 kiocb->ki_flags |= IOCB_NOWAIT; 872 789 } else { 873 790 /* Ensure we clear previously set non-block flag */ ··· 851 820 ppos = io_kiocb_update_pos(req); 852 821 853 822 ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); 854 - if (unlikely(ret)) { 855 - kfree(iovec); 823 + if (unlikely(ret)) 856 824 return ret; 857 - } 858 825 859 - ret = io_iter_do_read(rw, &s->iter); 826 + ret = io_iter_do_read(rw, &io->iter); 860 827 861 828 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { 862 829 req->flags &= ~REQ_F_REISSUE; 863 - /* 864 - * If we can poll, just do that. For a vectored read, we'll 865 - * need to copy state first. 866 - */ 867 - if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored) 830 + /* If we can poll, just do that. 
*/ 831 + if (io_file_can_poll(req)) 868 832 return -EAGAIN; 869 833 /* IOPOLL retry should happen for io-wq threads */ 870 834 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) ··· 869 843 goto done; 870 844 ret = 0; 871 845 } else if (ret == -EIOCBQUEUED) { 872 - if (iovec) 873 - kfree(iovec); 874 846 return IOU_ISSUE_SKIP_COMPLETE; 875 847 } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || 876 848 (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { ··· 881 857 * untouched in case of error. Restore it and we'll advance it 882 858 * manually if we need to. 883 859 */ 884 - iov_iter_restore(&s->iter, &s->iter_state); 885 - 886 - ret2 = io_setup_async_rw(req, iovec, s, true); 887 - iovec = NULL; 888 - if (ret2) { 889 - ret = ret > 0 ? ret : ret2; 890 - goto done; 891 - } 892 - 893 - io = req->async_data; 894 - s = &io->s; 895 - /* 896 - * Now use our persistent iterator and state, if we aren't already. 897 - * We've restored and mapped the iter to match. 898 - */ 860 + iov_iter_restore(&io->iter, &io->iter_state); 899 861 900 862 do { 901 863 /* ··· 889 879 * above or inside this loop. Advance the iter by the bytes 890 880 * that were consumed. 891 881 */ 892 - iov_iter_advance(&s->iter, ret); 893 - if (!iov_iter_count(&s->iter)) 882 + iov_iter_advance(&io->iter, ret); 883 + if (!iov_iter_count(&io->iter)) 894 884 break; 895 885 io->bytes_done += ret; 896 - iov_iter_save_state(&s->iter, &s->iter_state); 886 + iov_iter_save_state(&io->iter, &io->iter_state); 897 887 898 888 /* if we can retry, do so with the callbacks armed */ 899 889 if (!io_rw_should_retry(req)) { ··· 901 891 return -EAGAIN; 902 892 } 903 893 904 - req->cqe.res = iov_iter_count(&s->iter); 894 + req->cqe.res = iov_iter_count(&io->iter); 905 895 /* 906 896 * Now retry read with the IOCB_WAITQ parts set in the iocb. If 907 897 * we get -EIOCBQUEUED, then we'll get a notification when the 908 898 * desired page gets unlocked. 
We can also get a partial read 909 899 * here, and if we do, then just retry at the new offset. 910 900 */ 911 - ret = io_iter_do_read(rw, &s->iter); 901 + ret = io_iter_do_read(rw, &io->iter); 912 902 if (ret == -EIOCBQUEUED) 913 903 return IOU_ISSUE_SKIP_COMPLETE; 914 904 /* we got some bytes, but not all. retry. */ 915 905 kiocb->ki_flags &= ~IOCB_WAITQ; 916 - iov_iter_restore(&s->iter, &s->iter_state); 906 + iov_iter_restore(&io->iter, &io->iter_state); 917 907 } while (ret > 0); 918 908 done: 919 909 /* it's faster to check here then delegate to kfree */ 920 - if (iovec) 921 - kfree(iovec); 922 910 return ret; 923 911 } 924 912 ··· 979 971 cflags = io_put_kbuf(req, issue_flags); 980 972 rw->len = 0; /* similarly to above, reset len to 0 */ 981 973 982 - if (io_fill_cqe_req_aux(req, 983 - issue_flags & IO_URING_F_COMPLETE_DEFER, 984 - ret, cflags | IORING_CQE_F_MORE)) { 974 + if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { 985 975 if (issue_flags & IO_URING_F_MULTISHOT) { 986 976 /* 987 977 * Force retry, as we might have more data to ··· 998 992 * multishot request, hitting overflow will terminate it. 
999 993 */ 1000 994 io_req_set_res(req, ret, cflags); 995 + io_req_rw_cleanup(req, issue_flags); 1001 996 if (issue_flags & IO_URING_F_MULTISHOT) 1002 997 return IOU_STOP_MULTISHOT; 1003 998 return IOU_OK; ··· 1006 999 1007 1000 int io_write(struct io_kiocb *req, unsigned int issue_flags) 1008 1001 { 1009 - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 1010 - struct io_rw_state __s, *s = &__s; 1011 - struct iovec *iovec; 1012 - struct kiocb *kiocb = &rw->kiocb; 1013 1002 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 1003 + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 1004 + struct io_async_rw *io = req->async_data; 1005 + struct kiocb *kiocb = &rw->kiocb; 1014 1006 ssize_t ret, ret2; 1015 1007 loff_t *ppos; 1016 1008 1017 - if (!req_has_async_data(req)) { 1018 - ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags); 1019 - if (unlikely(ret < 0)) 1020 - return ret; 1021 - } else { 1022 - struct io_async_rw *io = req->async_data; 1023 - 1024 - s = &io->s; 1025 - iov_iter_restore(&s->iter, &s->iter_state); 1026 - iovec = NULL; 1027 - } 1028 1009 ret = io_rw_init_file(req, FMODE_WRITE); 1029 - if (unlikely(ret)) { 1030 - kfree(iovec); 1010 + if (unlikely(ret)) 1031 1011 return ret; 1032 - } 1033 - req->cqe.res = iov_iter_count(&s->iter); 1012 + req->cqe.res = iov_iter_count(&io->iter); 1034 1013 1035 1014 if (force_nonblock) { 1036 1015 /* If the file doesn't support async, just async punt */ 1037 1016 if (unlikely(!io_file_supports_nowait(req))) 1038 - goto copy_iov; 1017 + goto ret_eagain; 1039 1018 1040 1019 /* Check if we can support NOWAIT. 
*/ 1041 1020 if (!(kiocb->ki_flags & IOCB_DIRECT) && 1042 1021 !(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) && 1043 1022 (req->flags & REQ_F_ISREG)) 1044 - goto copy_iov; 1023 + goto ret_eagain; 1045 1024 1046 1025 kiocb->ki_flags |= IOCB_NOWAIT; 1047 1026 } else { ··· 1038 1045 ppos = io_kiocb_update_pos(req); 1039 1046 1040 1047 ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); 1041 - if (unlikely(ret)) { 1042 - kfree(iovec); 1048 + if (unlikely(ret)) 1043 1049 return ret; 1044 - } 1045 1050 1046 1051 if (req->flags & REQ_F_ISREG) 1047 1052 kiocb_start_write(kiocb); 1048 1053 kiocb->ki_flags |= IOCB_WRITE; 1049 1054 1050 1055 if (likely(req->file->f_op->write_iter)) 1051 - ret2 = call_write_iter(req->file, kiocb, &s->iter); 1056 + ret2 = call_write_iter(req->file, kiocb, &io->iter); 1052 1057 else if (req->file->f_op->write) 1053 - ret2 = loop_rw_iter(WRITE, rw, &s->iter); 1058 + ret2 = loop_rw_iter(WRITE, rw, &io->iter); 1054 1059 else 1055 1060 ret2 = -EINVAL; 1056 1061 ··· 1069 1078 if (!force_nonblock || ret2 != -EAGAIN) { 1070 1079 /* IOPOLL retry should happen for io-wq threads */ 1071 1080 if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) 1072 - goto copy_iov; 1081 + goto ret_eagain; 1073 1082 1074 1083 if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { 1075 - struct io_async_rw *io; 1076 - 1077 1084 trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, 1078 1085 req->cqe.res, ret2); 1079 1086 ··· 1080 1091 * in the worker. Also update bytes_done to account for 1081 1092 * the bytes already written. 1082 1093 */ 1083 - iov_iter_save_state(&s->iter, &s->iter_state); 1084 - ret = io_setup_async_rw(req, iovec, s, true); 1094 + iov_iter_save_state(&io->iter, &io->iter_state); 1095 + io->bytes_done += ret2; 1085 1096 1086 - io = req->async_data; 1087 - if (io) 1088 - io->bytes_done += ret2; 1089 - 1090 - if (kiocb->ki_flags & IOCB_WRITE) 1091 - io_req_end_write(req); 1092 - return ret ? 
ret : -EAGAIN; 1093 - } 1094 - done: 1095 - ret = kiocb_done(req, ret2, issue_flags); 1096 - } else { 1097 - copy_iov: 1098 - iov_iter_restore(&s->iter, &s->iter_state); 1099 - ret = io_setup_async_rw(req, iovec, s, false); 1100 - if (!ret) { 1101 1097 if (kiocb->ki_flags & IOCB_WRITE) 1102 1098 io_req_end_write(req); 1103 1099 return -EAGAIN; 1104 1100 } 1105 - return ret; 1101 + done: 1102 + return kiocb_done(req, ret2, issue_flags); 1103 + } else { 1104 + ret_eagain: 1105 + iov_iter_restore(&io->iter, &io->iter_state); 1106 + if (kiocb->ki_flags & IOCB_WRITE) 1107 + io_req_end_write(req); 1108 + return -EAGAIN; 1106 1109 } 1107 - /* it's reportedly faster than delegating the null check to kfree() */ 1108 - if (iovec) 1109 - kfree(iovec); 1110 - return ret; 1111 1110 } 1112 1111 1113 1112 void io_rw_fail(struct io_kiocb *req) ··· 1169 1192 break; 1170 1193 nr_events++; 1171 1194 req->cqe.flags = io_put_kbuf(req, 0); 1195 + if (req->opcode != IORING_OP_URING_CMD) 1196 + io_req_rw_cleanup(req, 0); 1172 1197 } 1173 1198 if (unlikely(!nr_events)) 1174 1199 return 0; ··· 1183 1204 ctx->submit_state.compl_reqs.first = pos; 1184 1205 __io_submit_flush_completions(ctx); 1185 1206 return nr_events; 1207 + } 1208 + 1209 + void io_rw_cache_free(const void *entry) 1210 + { 1211 + struct io_async_rw *rw = (struct io_async_rw *) entry; 1212 + 1213 + if (rw->free_iovec) { 1214 + kasan_mempool_unpoison_object(rw->free_iovec, 1215 + rw->free_iov_nr * sizeof(struct iovec)); 1216 + io_rw_iovec_free(rw); 1217 + } 1218 + kfree(rw); 1186 1219 }

+12 -13

io_uring/rw.h

··· 2 2 3 3 #include <linux/pagemap.h> 4 4 5 - struct io_rw_state { 5 + struct io_async_rw { 6 + size_t bytes_done; 6 7 struct iov_iter iter; 7 8 struct iov_iter_state iter_state; 8 - struct iovec fast_iov[UIO_FASTIOV]; 9 - }; 10 - 11 - struct io_async_rw { 12 - struct io_rw_state s; 13 - const struct iovec *free_iovec; 14 - size_t bytes_done; 9 + struct iovec fast_iov; 10 + struct iovec *free_iovec; 11 + int free_iov_nr; 15 12 struct wait_page_queue wpq; 16 13 }; 17 14 18 - int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); 19 - int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe); 20 - int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); 15 + int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); 16 + int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); 17 + int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe); 18 + int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe); 19 + int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); 20 + int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); 21 21 int io_read(struct io_kiocb *req, unsigned int issue_flags); 22 - int io_readv_prep_async(struct io_kiocb *req); 23 22 int io_write(struct io_kiocb *req, unsigned int issue_flags); 24 - int io_writev_prep_async(struct io_kiocb *req); 25 23 void io_readv_writev_cleanup(struct io_kiocb *req); 26 24 void io_rw_fail(struct io_kiocb *req); 27 25 void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); 28 26 int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 29 27 int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); 28 + void io_rw_cache_free(const void *entry);

+8

io_uring/sqpoll.c

··· 291 291 sqd->sq_cpu = raw_smp_processor_id(); 292 292 } 293 293 294 + /* 295 + * Force audit context to get setup, in case we do prep side async 296 + * operations that would trigger an audit call before any issue side 297 + * audit has been done. 298 + */ 299 + audit_uring_entry(IORING_OP_NOP); 300 + audit_uring_exit(true, 0); 301 + 294 302 mutex_lock(&sqd->lock); 295 303 while (1) { 296 304 bool cap_entries, sqt_spin = false;

+2 -7

io_uring/timeout.c

··· 72 72 struct io_ring_ctx *ctx = req->ctx; 73 73 74 74 if (!io_timeout_finish(timeout, data)) { 75 - bool filled; 76 - filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME, 77 - IORING_CQE_F_MORE); 78 - if (filled) { 75 + if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { 79 76 /* re-arm timer */ 80 77 spin_lock_irq(&ctx->timeout_lock); 81 78 list_add(&timeout->list, ctx->timeout_list.prev); ··· 298 301 299 302 static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) 300 303 { 301 - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; 302 304 struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 303 305 struct io_kiocb *prev = timeout->prev; 304 306 int ret = -ENOENT; ··· 309 313 .data = prev->cqe.user_data, 310 314 }; 311 315 312 - ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); 316 + ret = io_try_cancel(req->task->io_uring, &cd, 0); 313 317 } 314 318 io_req_set_res(req, ret ?: -ETIME, 0); 315 319 io_req_task_complete(req, ts); ··· 537 541 if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) 538 542 return -EINVAL; 539 543 540 - INIT_LIST_HEAD(&timeout->list); 541 544 data->mode = io_translate_timeout_mode(flags); 542 545 hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); 543 546

+99 -23

io_uring/uring_cmd.c

··· 3 3 #include <linux/errno.h> 4 4 #include <linux/file.h> 5 5 #include <linux/io_uring/cmd.h> 6 + #include <linux/io_uring/net.h> 6 7 #include <linux/security.h> 7 8 #include <linux/nospec.h> 8 9 #include <net/sock.h> ··· 12 11 #include <asm/ioctls.h> 13 12 14 13 #include "io_uring.h" 14 + #include "alloc_cache.h" 15 15 #include "rsrc.h" 16 16 #include "uring_cmd.h" 17 + 18 + static struct uring_cache *io_uring_async_get(struct io_kiocb *req) 19 + { 20 + struct io_ring_ctx *ctx = req->ctx; 21 + struct uring_cache *cache; 22 + 23 + cache = io_alloc_cache_get(&ctx->uring_cache); 24 + if (cache) { 25 + req->flags |= REQ_F_ASYNC_DATA; 26 + req->async_data = cache; 27 + return cache; 28 + } 29 + if (!io_alloc_async_data(req)) 30 + return req->async_data; 31 + return NULL; 32 + } 33 + 34 + static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) 35 + { 36 + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 37 + struct uring_cache *cache = req->async_data; 38 + 39 + if (issue_flags & IO_URING_F_UNLOCKED) 40 + return; 41 + if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { 42 + ioucmd->sqe = NULL; 43 + req->async_data = NULL; 44 + req->flags &= ~REQ_F_ASYNC_DATA; 45 + } 46 + } 47 + 48 + bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, 49 + struct task_struct *task, bool cancel_all) 50 + { 51 + struct hlist_node *tmp; 52 + struct io_kiocb *req; 53 + bool ret = false; 54 + 55 + lockdep_assert_held(&ctx->uring_lock); 56 + 57 + hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, 58 + hash_node) { 59 + struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, 60 + struct io_uring_cmd); 61 + struct file *file = req->file; 62 + 63 + if (!cancel_all && req->task != task) 64 + continue; 65 + 66 + if (cmd->flags & IORING_URING_CMD_CANCELABLE) { 67 + /* ->sqe isn't available if no async data */ 68 + if (!req_has_async_data(req)) 69 + cmd->sqe = NULL; 70 + file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL | 71 + 
IO_URING_F_COMPLETE_DEFER); 72 + ret = true; 73 + } 74 + } 75 + io_submit_flush_completions(ctx); 76 + return ret; 77 + } 17 78 18 79 static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd, 19 80 unsigned int issue_flags) ··· 119 56 static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) 120 57 { 121 58 struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 122 - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; 123 59 124 - ioucmd->task_work_cb(ioucmd, issue_flags); 60 + /* task_work executor checks the deffered list completion */ 61 + ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER); 125 62 } 126 63 127 64 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, ··· 160 97 io_req_set_res(req, ret, 0); 161 98 if (req->ctx->flags & IORING_SETUP_CQE32) 162 99 io_req_set_cqe32_extra(req, res2, 0); 100 + io_req_uring_cleanup(req, issue_flags); 163 101 if (req->ctx->flags & IORING_SETUP_IOPOLL) { 164 102 /* order with io_iopoll_req_issued() checking ->iopoll_complete */ 165 103 smp_store_release(&req->iopoll_completed, 1); 104 + } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) { 105 + if (WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED)) 106 + return; 107 + io_req_complete_defer(req); 166 108 } else { 167 - struct io_tw_state ts = { 168 - .locked = !(issue_flags & IO_URING_F_UNLOCKED), 169 - }; 170 - io_req_task_complete(req, &ts); 109 + req->io_task_work.func = io_req_task_complete; 110 + io_req_task_work_add(req); 171 111 } 172 112 } 173 113 EXPORT_SYMBOL_GPL(io_uring_cmd_done); 174 114 175 - int io_uring_cmd_prep_async(struct io_kiocb *req) 115 + static int io_uring_cmd_prep_setup(struct io_kiocb *req, 116 + const struct io_uring_sqe *sqe) 176 117 { 177 118 struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 119 + struct uring_cache *cache; 178 120 179 - memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx)); 121 + cache = io_uring_async_get(req); 122 + if 
(unlikely(!cache)) 123 + return -ENOMEM; 124 + 125 + if (!(req->flags & REQ_F_FORCE_ASYNC)) { 126 + /* defer memcpy until we need it */ 127 + ioucmd->sqe = sqe; 128 + return 0; 129 + } 130 + 131 + memcpy(req->async_data, sqe, uring_sqe_size(req->ctx)); 180 132 ioucmd->sqe = req->async_data; 181 133 return 0; 182 134 } ··· 218 140 req->imu = ctx->user_bufs[index]; 219 141 io_req_set_rsrc_node(req, ctx, 0); 220 142 } 221 - ioucmd->sqe = sqe; 222 143 ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); 223 - return 0; 144 + 145 + return io_uring_cmd_prep_setup(req, sqe); 224 146 } 225 147 226 148 int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) ··· 252 174 253 175 ret = file->f_op->uring_cmd(ioucmd, issue_flags); 254 176 if (ret == -EAGAIN) { 255 - if (!req_has_async_data(req)) { 256 - if (io_alloc_async_data(req)) 257 - return -ENOMEM; 258 - io_uring_cmd_prep_async(req); 259 - } 177 + struct uring_cache *cache = req->async_data; 178 + 179 + if (ioucmd->sqe != (void *) cache) 180 + memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx)); 260 181 return -EAGAIN; 182 + } else if (ret == -EIOCBQUEUED) { 183 + return -EIOCBQUEUED; 261 184 } 262 185 263 - if (ret != -EIOCBQUEUED) { 264 - if (ret < 0) 265 - req_set_fail(req); 266 - io_req_set_res(req, ret, 0); 267 - return ret; 268 - } 269 - 270 - return IOU_ISSUE_SKIP_COMPLETE; 186 + if (ret < 0) 187 + req_set_fail(req); 188 + io_req_uring_cleanup(req, issue_flags); 189 + io_req_set_res(req, ret, 0); 190 + return ret; 271 191 } 272 192 273 193 int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,

+7 -1

io_uring/uring_cmd.h

··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 + struct uring_cache { 4 + struct io_uring_sqe sqes[2]; 5 + }; 6 + 3 7 int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); 4 8 int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 5 - int io_uring_cmd_prep_async(struct io_kiocb *req); 9 + 10 + bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, 11 + struct task_struct *task, bool cancel_all);

+1 -1

io_uring/waitid.c

··· 118 118 static void io_waitid_complete(struct io_kiocb *req, int ret) 119 119 { 120 120 struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 121 - struct io_tw_state ts = { .locked = true }; 121 + struct io_tw_state ts = {}; 122 122 123 123 /* anyone completing better be holding a reference */ 124 124 WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK));

+7

mm/nommu.c

··· 355 355 } 356 356 EXPORT_SYMBOL(vm_insert_page); 357 357 358 + int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, 359 + struct page **pages, unsigned long *num) 360 + { 361 + return -EINVAL; 362 + } 363 + EXPORT_SYMBOL(vm_insert_pages); 364 + 358 365 int vm_map_pages(struct vm_area_struct *vma, struct page **pages, 359 366 unsigned long num) 360 367 {

+24 -12

net/core/skbuff.c

··· 1708 1708 return NULL; 1709 1709 } 1710 1710 1711 - uarg->ubuf.callback = msg_zerocopy_callback; 1711 + uarg->ubuf.ops = &msg_zerocopy_ubuf_ops; 1712 1712 uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; 1713 1713 uarg->len = 1; 1714 1714 uarg->bytelen = size; ··· 1734 1734 u32 bytelen, next; 1735 1735 1736 1736 /* there might be non MSG_ZEROCOPY users */ 1737 - if (uarg->callback != msg_zerocopy_callback) 1737 + if (uarg->ops != &msg_zerocopy_ubuf_ops) 1738 1738 return NULL; 1739 1739 1740 1740 /* realloc only when socket is locked (TCP, UDP cork), ··· 1845 1845 sock_put(sk); 1846 1846 } 1847 1847 1848 - void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, 1849 - bool success) 1848 + static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, 1849 + bool success) 1850 1850 { 1851 1851 struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); 1852 1852 ··· 1855 1855 if (refcount_dec_and_test(&uarg->refcnt)) 1856 1856 __msg_zerocopy_callback(uarg_zc); 1857 1857 } 1858 - EXPORT_SYMBOL_GPL(msg_zerocopy_callback); 1859 1858 1860 1859 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) 1861 1860 { ··· 1864 1865 uarg_to_msgzc(uarg)->len--; 1865 1866 1866 1867 if (have_uref) 1867 - msg_zerocopy_callback(NULL, uarg, true); 1868 + msg_zerocopy_complete(NULL, uarg, true); 1868 1869 } 1869 1870 EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); 1871 + 1872 + const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { 1873 + .complete = msg_zerocopy_complete, 1874 + }; 1875 + EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); 1870 1876 1871 1877 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, 1872 1878 struct msghdr *msg, int len, ··· 1880 1876 struct ubuf_info *orig_uarg = skb_zcopy(skb); 1881 1877 int err, orig_len = skb->len; 1882 1878 1883 - /* An skb can only point to one uarg. This edge case happens when 1884 - * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. 
1885 - */ 1886 - if (orig_uarg && uarg != orig_uarg) 1887 - return -EEXIST; 1879 + if (uarg->ops->link_skb) { 1880 + err = uarg->ops->link_skb(skb, uarg); 1881 + if (err) 1882 + return err; 1883 + } else { 1884 + /* An skb can only point to one uarg. This edge case happens 1885 + * when TCP appends to an skb, but zerocopy_realloc triggered 1886 + * a new alloc. 1887 + */ 1888 + if (orig_uarg && uarg != orig_uarg) 1889 + return -EEXIST; 1890 + } 1888 1891 1889 1892 err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); 1890 1893 if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { ··· 1905 1894 return err; 1906 1895 } 1907 1896 1908 - skb_zcopy_set(skb, uarg, NULL); 1897 + if (!uarg->ops->link_skb) 1898 + skb_zcopy_set(skb, uarg, NULL); 1909 1899 return skb->len - orig_len; 1910 1900 } 1911 1901 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);

+1 -1

net/socket.c

··· 88 88 #include <linux/xattr.h> 89 89 #include <linux/nospec.h> 90 90 #include <linux/indirect_call_wrapper.h> 91 - #include <linux/io_uring.h> 91 + #include <linux/io_uring/net.h> 92 92 93 93 #include <linux/uaccess.h> 94 94 #include <asm/unistd.h>

Configure Feed

Configure Feed