Merge tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Pull io_uring updates from Jens Axboe:
"Here are the io_uring updates queued up for 6.11.

Nothing major this time around, various minor improvements and
cleanups/fixes. This contains:

- Add bind/listen opcodes. Main motivation is to support direct
descriptors, to avoid needing a regular fd just for doing these two
operations (Gabriel)

- Probe fixes (Gabriel)

- Treat io-wq work flags as atomics. This does not fix a real issue, but
we may as well do it, and it silences a KCSAN warning (me)

- Cleanup of rsrc __set_current_state() usage (me)

- Add 64-bit length support for {m,f}advise operations (me)

- Improve performance of data ring messages (me)

- Fix for ring message overflow posting (Pavel)

- Fix for freezer interaction with TWA_NOTIFY_SIGNAL. Not strictly an
io_uring thing, but since TWA_NOTIFY_SIGNAL was originally added
for faster task_work signaling for io_uring, bundling it with this
pull (Pavel)

- Add Pavel as a co-maintainer

- Various cleanups (me, Thorsten)"

* tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux: (28 commits)
io_uring/net: check socket is valid in io_bind()/io_listen()
kernel: rerun task_work while freezing in get_signal()
io_uring/io-wq: limit retrying worker initialisation
io_uring/napi: Remove unnecessary s64 cast
io_uring/net: cleanup io_recv_finish() bundle handling
io_uring/msg_ring: fix overflow posting
MAINTAINERS: change Pavel Begunkov from io_uring reviewer to maintainer
io_uring/msg_ring: use kmem_cache_free() to free request
io_uring/msg_ring: check for dead submitter task
io_uring/msg_ring: add an alloc cache for io_kiocb entries
io_uring/msg_ring: improve handling of target CQE posting
io_uring: add io_add_aux_cqe() helper
io_uring: add remote task_work execution helper
io_uring/msg_ring: tighten requirement for remote posting
io_uring: Allocate only necessary memory in io_probe
io_uring: Fix probe of disabled operations
io_uring: Introduce IORING_OP_LISTEN
io_uring: Introduce IORING_OP_BIND
net: Split a __sys_listen helper for io_uring
net: Split a __sys_bind helper for io_uring
...

Linus Torvalds 2 years ago 3a56e241 4f5e249e

+547 -319

23 changed files

expand all

MAINTAINERS

include

linux

io_uring_types.h

socket.h

uapi

linux

io_uring.h

io_uring

Makefile

advise.c

eventfd.c

eventfd.h

io-wq.c

io-wq.h

io_uring.c

io_uring.h

msg_ring.c

msg_ring.h

napi.c

net.c

net.h

opdef.c

opdef.h

rsrc.c

kernel

signal.c

net

socket.c

+1 -1

MAINTAINERS

··· 11551 11551 11552 11552 IO_URING 11553 11553 M: Jens Axboe <axboe@kernel.dk> 11554 - R: Pavel Begunkov <asml.silence@gmail.com> 11554 + M: Pavel Begunkov <asml.silence@gmail.com> 11555 11555 L: io-uring@vger.kernel.org 11556 11556 S: Maintained 11557 11557 T: git git://git.kernel.dk/linux-block

+4 -10

include/linux/io_uring_types.h

··· 50 50 51 51 struct io_wq_work { 52 52 struct io_wq_work_node list; 53 - unsigned flags; 53 + atomic_t flags; 54 54 /* place it here instead of io_kiocb as it fills padding and saves 4B */ 55 55 int cancel_seq; 56 56 }; ··· 210 210 struct blk_plug plug; 211 211 }; 212 212 213 - struct io_ev_fd { 214 - struct eventfd_ctx *cq_ev_fd; 215 - unsigned int eventfd_async: 1; 216 - struct rcu_head rcu; 217 - atomic_t refs; 218 - atomic_t ops; 219 - }; 220 - 221 213 struct io_alloc_cache { 222 214 void **entries; 223 215 unsigned int nr_cached; ··· 364 372 struct io_restriction restrictions; 365 373 366 374 /* slow path rsrc auxilary data, used by update/register */ 367 - struct io_mapped_ubuf *dummy_ubuf; 368 375 struct io_rsrc_data *file_data; 369 376 struct io_rsrc_data *buf_data; 370 377 ··· 395 404 396 405 struct callback_head poll_wq_task_work; 397 406 struct list_head defer_list; 407 + 408 + struct io_alloc_cache msg_cache; 409 + spinlock_t msg_lock; 398 410 399 411 #ifdef CONFIG_NET_RX_BUSY_POLL 400 412 struct list_head napi_list; /* track busy poll napi_id */

include/linux/socket.h

··· 442 442 extern int __sys_socket(int family, int type, int protocol); 443 443 extern struct file *__sys_socket_file(int family, int type, int protocol); 444 444 extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); 445 + extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, 446 + int addrlen); 445 447 extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, 446 448 int addrlen, int file_flags); 447 449 extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, 448 450 int addrlen); 449 451 extern int __sys_listen(int fd, int backlog); 452 + extern int __sys_listen_socket(struct socket *sock, int backlog); 450 453 extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, 451 454 int __user *usockaddr_len); 452 455 extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,

include/uapi/linux/io_uring.h

··· 257 257 IORING_OP_FUTEX_WAITV, 258 258 IORING_OP_FIXED_FD_INSTALL, 259 259 IORING_OP_FTRUNCATE, 260 + IORING_OP_BIND, 261 + IORING_OP_LISTEN, 260 262 261 263 /* this goes last, obviously */ 262 264 IORING_OP_LAST,

+3 -3

io_uring/Makefile

··· 4 4 5 5 obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ 6 6 tctx.o filetable.o rw.o net.o poll.o \ 7 - uring_cmd.o openclose.o sqpoll.o \ 8 - xattr.o nop.o fs.o splice.o sync.o \ 9 - msg_ring.o advise.o openclose.o \ 7 + eventfd.o uring_cmd.o openclose.o \ 8 + sqpoll.o xattr.o nop.o fs.o splice.o \ 9 + sync.o msg_ring.o advise.o openclose.o \ 10 10 epoll.o statx.o timeout.o fdinfo.o \ 11 11 cancel.o waitid.o register.o \ 12 12 truncate.o memmap.o

+10 -6

io_uring/advise.c

··· 17 17 struct io_fadvise { 18 18 struct file *file; 19 19 u64 offset; 20 - u32 len; 20 + u64 len; 21 21 u32 advice; 22 22 }; 23 23 24 24 struct io_madvise { 25 25 struct file *file; 26 26 u64 addr; 27 - u32 len; 27 + u64 len; 28 28 u32 advice; 29 29 }; 30 30 ··· 33 33 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 34 34 struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); 35 35 36 - if (sqe->buf_index || sqe->off || sqe->splice_fd_in) 36 + if (sqe->buf_index || sqe->splice_fd_in) 37 37 return -EINVAL; 38 38 39 39 ma->addr = READ_ONCE(sqe->addr); 40 - ma->len = READ_ONCE(sqe->len); 40 + ma->len = READ_ONCE(sqe->off); 41 + if (!ma->len) 42 + ma->len = READ_ONCE(sqe->len); 41 43 ma->advice = READ_ONCE(sqe->fadvise_advice); 42 44 req->flags |= REQ_F_FORCE_ASYNC; 43 45 return 0; ··· 80 78 { 81 79 struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); 82 80 83 - if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) 81 + if (sqe->buf_index || sqe->splice_fd_in) 84 82 return -EINVAL; 85 83 86 84 fa->offset = READ_ONCE(sqe->off); 87 - fa->len = READ_ONCE(sqe->len); 85 + fa->len = READ_ONCE(sqe->addr); 86 + if (!fa->len) 87 + fa->len = READ_ONCE(sqe->len); 88 88 fa->advice = READ_ONCE(sqe->fadvise_advice); 89 89 if (io_fadvise_force_async(fa)) 90 90 req->flags |= REQ_F_FORCE_ASYNC;

+160

io_uring/eventfd.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <linux/kernel.h> 3 + #include <linux/errno.h> 4 + #include <linux/mm.h> 5 + #include <linux/slab.h> 6 + #include <linux/eventfd.h> 7 + #include <linux/eventpoll.h> 8 + #include <linux/io_uring.h> 9 + #include <linux/io_uring_types.h> 10 + 11 + #include "io-wq.h" 12 + #include "eventfd.h" 13 + 14 + struct io_ev_fd { 15 + struct eventfd_ctx *cq_ev_fd; 16 + unsigned int eventfd_async: 1; 17 + struct rcu_head rcu; 18 + atomic_t refs; 19 + atomic_t ops; 20 + }; 21 + 22 + enum { 23 + IO_EVENTFD_OP_SIGNAL_BIT, 24 + }; 25 + 26 + static void io_eventfd_free(struct rcu_head *rcu) 27 + { 28 + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); 29 + 30 + eventfd_ctx_put(ev_fd->cq_ev_fd); 31 + kfree(ev_fd); 32 + } 33 + 34 + static void io_eventfd_do_signal(struct rcu_head *rcu) 35 + { 36 + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); 37 + 38 + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); 39 + 40 + if (atomic_dec_and_test(&ev_fd->refs)) 41 + io_eventfd_free(rcu); 42 + } 43 + 44 + void io_eventfd_signal(struct io_ring_ctx *ctx) 45 + { 46 + struct io_ev_fd *ev_fd = NULL; 47 + 48 + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) 49 + return; 50 + 51 + guard(rcu)(); 52 + 53 + /* 54 + * rcu_dereference ctx->io_ev_fd once and use it for both for checking 55 + * and eventfd_signal 56 + */ 57 + ev_fd = rcu_dereference(ctx->io_ev_fd); 58 + 59 + /* 60 + * Check again if ev_fd exists incase an io_eventfd_unregister call 61 + * completed between the NULL check of ctx->io_ev_fd at the start of 62 + * the function and rcu_read_lock. 
63 + */ 64 + if (unlikely(!ev_fd)) 65 + return; 66 + if (!atomic_inc_not_zero(&ev_fd->refs)) 67 + return; 68 + if (ev_fd->eventfd_async && !io_wq_current_is_worker()) 69 + goto out; 70 + 71 + if (likely(eventfd_signal_allowed())) { 72 + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); 73 + } else { 74 + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { 75 + call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); 76 + return; 77 + } 78 + } 79 + out: 80 + if (atomic_dec_and_test(&ev_fd->refs)) 81 + call_rcu(&ev_fd->rcu, io_eventfd_free); 82 + } 83 + 84 + void io_eventfd_flush_signal(struct io_ring_ctx *ctx) 85 + { 86 + bool skip; 87 + 88 + spin_lock(&ctx->completion_lock); 89 + 90 + /* 91 + * Eventfd should only get triggered when at least one event has been 92 + * posted. Some applications rely on the eventfd notification count 93 + * only changing IFF a new CQE has been added to the CQ ring. There's 94 + * no depedency on 1:1 relationship between how many times this 95 + * function is called (and hence the eventfd count) and number of CQEs 96 + * posted to the CQ ring. 
97 + */ 98 + skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; 99 + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; 100 + spin_unlock(&ctx->completion_lock); 101 + if (skip) 102 + return; 103 + 104 + io_eventfd_signal(ctx); 105 + } 106 + 107 + int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, 108 + unsigned int eventfd_async) 109 + { 110 + struct io_ev_fd *ev_fd; 111 + __s32 __user *fds = arg; 112 + int fd; 113 + 114 + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, 115 + lockdep_is_held(&ctx->uring_lock)); 116 + if (ev_fd) 117 + return -EBUSY; 118 + 119 + if (copy_from_user(&fd, fds, sizeof(*fds))) 120 + return -EFAULT; 121 + 122 + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); 123 + if (!ev_fd) 124 + return -ENOMEM; 125 + 126 + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); 127 + if (IS_ERR(ev_fd->cq_ev_fd)) { 128 + int ret = PTR_ERR(ev_fd->cq_ev_fd); 129 + kfree(ev_fd); 130 + return ret; 131 + } 132 + 133 + spin_lock(&ctx->completion_lock); 134 + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; 135 + spin_unlock(&ctx->completion_lock); 136 + 137 + ev_fd->eventfd_async = eventfd_async; 138 + ctx->has_evfd = true; 139 + atomic_set(&ev_fd->refs, 1); 140 + atomic_set(&ev_fd->ops, 0); 141 + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); 142 + return 0; 143 + } 144 + 145 + int io_eventfd_unregister(struct io_ring_ctx *ctx) 146 + { 147 + struct io_ev_fd *ev_fd; 148 + 149 + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, 150 + lockdep_is_held(&ctx->uring_lock)); 151 + if (ev_fd) { 152 + ctx->has_evfd = false; 153 + rcu_assign_pointer(ctx->io_ev_fd, NULL); 154 + if (atomic_dec_and_test(&ev_fd->refs)) 155 + call_rcu(&ev_fd->rcu, io_eventfd_free); 156 + return 0; 157 + } 158 + 159 + return -ENXIO; 160 + }

io_uring/eventfd.h

··· 1 + 2 + struct io_ring_ctx; 3 + int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, 4 + unsigned int eventfd_async); 5 + int io_eventfd_unregister(struct io_ring_ctx *ctx); 6 + 7 + void io_eventfd_flush_signal(struct io_ring_ctx *ctx); 8 + void io_eventfd_signal(struct io_ring_ctx *ctx);

+17 -12

io_uring/io-wq.c

··· 23 23 #include "io_uring.h" 24 24 25 25 #define WORKER_IDLE_TIMEOUT (5 * HZ) 26 + #define WORKER_INIT_LIMIT 3 26 27 27 28 enum { 28 29 IO_WORKER_F_UP = 0, /* up and active */ ··· 59 58 60 59 unsigned long create_state; 61 60 struct callback_head create_work; 61 + int init_retries; 62 62 63 63 union { 64 64 struct rcu_head rcu; ··· 161 159 static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, 162 160 struct io_wq_work *work) 163 161 { 164 - return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); 162 + return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); 165 163 } 166 164 167 165 static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) ··· 453 451 454 452 static inline unsigned int io_get_work_hash(struct io_wq_work *work) 455 453 { 456 - return work->flags >> IO_WQ_HASH_SHIFT; 454 + return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; 457 455 } 458 456 459 457 static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) ··· 594 592 595 593 next_hashed = wq_next_work(work); 596 594 597 - if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) 598 - work->flags |= IO_WQ_WORK_CANCEL; 595 + if (do_kill && 596 + (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) 597 + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); 599 598 wq->do_work(work); 600 599 io_assign_current_work(worker, NULL); 601 600 ··· 747 744 return true; 748 745 } 749 746 750 - static inline bool io_should_retry_thread(long err) 747 + static inline bool io_should_retry_thread(struct io_worker *worker, long err) 751 748 { 752 749 /* 753 750 * Prevent perpetual task_work retry, if the task (or its group) is 754 751 * exiting. 
755 752 */ 756 753 if (fatal_signal_pending(current)) 754 + return false; 755 + if (worker->init_retries++ >= WORKER_INIT_LIMIT) 757 756 return false; 758 757 759 758 switch (err) { ··· 783 778 io_init_new_worker(wq, worker, tsk); 784 779 io_worker_release(worker); 785 780 return; 786 - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { 781 + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { 787 782 struct io_wq_acct *acct = io_wq_get_acct(worker); 788 783 789 784 atomic_dec(&acct->nr_running); ··· 850 845 tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); 851 846 if (!IS_ERR(tsk)) { 852 847 io_init_new_worker(wq, worker, tsk); 853 - } else if (!io_should_retry_thread(PTR_ERR(tsk))) { 848 + } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { 854 849 kfree(worker); 855 850 goto fail; 856 851 } else { ··· 896 891 static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) 897 892 { 898 893 do { 899 - work->flags |= IO_WQ_WORK_CANCEL; 894 + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); 900 895 wq->do_work(work); 901 896 work = wq->free_work(work); 902 897 } while (work); ··· 931 926 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) 932 927 { 933 928 struct io_wq_acct *acct = io_work_get_acct(wq, work); 934 - unsigned long work_flags = work->flags; 929 + unsigned int work_flags = atomic_read(&work->flags); 935 930 struct io_cb_cancel_data match = { 936 931 .fn = io_wq_work_match_item, 937 932 .data = work, ··· 944 939 * been marked as one that should not get executed, cancel it here. 
945 940 */ 946 941 if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || 947 - (work->flags & IO_WQ_WORK_CANCEL)) { 942 + (work_flags & IO_WQ_WORK_CANCEL)) { 948 943 io_run_cancel(work, wq); 949 944 return; 950 945 } ··· 987 982 unsigned int bit; 988 983 989 984 bit = hash_ptr(val, IO_WQ_HASH_ORDER); 990 - work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); 985 + atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags); 991 986 } 992 987 993 988 static bool __io_wq_worker_cancel(struct io_worker *worker, ··· 995 990 struct io_wq_work *work) 996 991 { 997 992 if (work && match->fn(work, match->data)) { 998 - work->flags |= IO_WQ_WORK_CANCEL; 993 + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); 999 994 __set_notify_signal(worker->task); 1000 995 return true; 1001 996 }

+1 -1

io_uring/io-wq.h

··· 56 56 57 57 static inline bool io_wq_is_hashed(struct io_wq_work *work) 58 58 { 59 - return work->flags & IO_WQ_WORK_HASHED; 59 + return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; 60 60 } 61 61 62 62 typedef bool (work_cancel_fn)(struct io_wq_work *, void *);

+57 -97

io_uring/io_uring.c

··· 95 95 #include "futex.h" 96 96 #include "napi.h" 97 97 #include "uring_cmd.h" 98 + #include "msg_ring.h" 98 99 #include "memmap.h" 99 100 100 101 #include "timeout.h" 101 102 #include "poll.h" 102 103 #include "rw.h" 103 104 #include "alloc_cache.h" 105 + #include "eventfd.h" 104 106 105 107 #define IORING_MAX_ENTRIES 32768 106 108 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) ··· 316 314 sizeof(struct io_async_rw)); 317 315 ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, 318 316 sizeof(struct uring_cache)); 317 + spin_lock_init(&ctx->msg_lock); 318 + ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, 319 + sizeof(struct io_kiocb)); 319 320 ret |= io_futex_cache_init(ctx); 320 321 if (ret) 321 322 goto err; ··· 355 350 io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 356 351 io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 357 352 io_alloc_cache_free(&ctx->uring_cache, kfree); 353 + io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); 358 354 io_futex_cache_free(ctx); 359 355 kfree(ctx->cancel_table.hbs); 360 356 kfree(ctx->cancel_table_locked.hbs); ··· 467 461 } 468 462 469 463 req->work.list.next = NULL; 470 - req->work.flags = 0; 464 + atomic_set(&req->work.flags, 0); 471 465 if (req->flags & REQ_F_FORCE_ASYNC) 472 - req->work.flags |= IO_WQ_WORK_CONCURRENT; 466 + atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags); 473 467 474 468 if (req->file && !(req->flags & REQ_F_FIXED_FILE)) 475 469 req->flags |= io_file_get_flags(req->file); ··· 485 479 io_wq_hash_work(&req->work, file_inode(req->file)); 486 480 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { 487 481 if (def->unbound_nonreg_file) 488 - req->work.flags |= IO_WQ_WORK_UNBOUND; 482 + atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags); 489 483 } 490 484 } 491 485 ··· 525 519 * worker for it). 
526 520 */ 527 521 if (WARN_ON_ONCE(!same_thread_group(req->task, current))) 528 - req->work.flags |= IO_WQ_WORK_CANCEL; 522 + atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); 529 523 530 524 trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); 531 525 io_wq_enqueue(tctx->io_wq, &req->work); ··· 545 539 io_req_task_queue(de->req); 546 540 kfree(de); 547 541 } 548 - } 549 - 550 - void io_eventfd_ops(struct rcu_head *rcu) 551 - { 552 - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); 553 - int ops = atomic_xchg(&ev_fd->ops, 0); 554 - 555 - if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) 556 - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); 557 - 558 - /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback 559 - * ordering in a race but if references are 0 we know we have to free 560 - * it regardless. 561 - */ 562 - if (atomic_dec_and_test(&ev_fd->refs)) { 563 - eventfd_ctx_put(ev_fd->cq_ev_fd); 564 - kfree(ev_fd); 565 - } 566 - } 567 - 568 - static void io_eventfd_signal(struct io_ring_ctx *ctx) 569 - { 570 - struct io_ev_fd *ev_fd = NULL; 571 - 572 - rcu_read_lock(); 573 - /* 574 - * rcu_dereference ctx->io_ev_fd once and use it for both for checking 575 - * and eventfd_signal 576 - */ 577 - ev_fd = rcu_dereference(ctx->io_ev_fd); 578 - 579 - /* 580 - * Check again if ev_fd exists incase an io_eventfd_unregister call 581 - * completed between the NULL check of ctx->io_ev_fd at the start of 582 - * the function and rcu_read_lock. 
583 - */ 584 - if (unlikely(!ev_fd)) 585 - goto out; 586 - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) 587 - goto out; 588 - if (ev_fd->eventfd_async && !io_wq_current_is_worker()) 589 - goto out; 590 - 591 - if (likely(eventfd_signal_allowed())) { 592 - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); 593 - } else { 594 - atomic_inc(&ev_fd->refs); 595 - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) 596 - call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops); 597 - else 598 - atomic_dec(&ev_fd->refs); 599 - } 600 - 601 - out: 602 - rcu_read_unlock(); 603 - } 604 - 605 - static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) 606 - { 607 - bool skip; 608 - 609 - spin_lock(&ctx->completion_lock); 610 - 611 - /* 612 - * Eventfd should only get triggered when at least one event has been 613 - * posted. Some applications rely on the eventfd notification count 614 - * only changing IFF a new CQE has been added to the CQ ring. There's 615 - * no depedency on 1:1 relationship between how many times this 616 - * function is called (and hence the eventfd count) and number of CQEs 617 - * posted to the CQ ring. 
618 - */ 619 - skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; 620 - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; 621 - spin_unlock(&ctx->completion_lock); 622 - if (skip) 623 - return; 624 - 625 - io_eventfd_signal(ctx); 626 542 } 627 543 628 544 void __io_commit_cqring_flush(struct io_ring_ctx *ctx) ··· 806 878 return false; 807 879 } 808 880 881 + static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, 882 + u32 cflags) 883 + { 884 + bool filled; 885 + 886 + filled = io_fill_cqe_aux(ctx, user_data, res, cflags); 887 + if (!filled) 888 + filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); 889 + 890 + return filled; 891 + } 892 + 809 893 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) 810 894 { 811 895 bool filled; 812 896 813 897 io_cq_lock(ctx); 814 - filled = io_fill_cqe_aux(ctx, user_data, res, cflags); 815 - if (!filled) 816 - filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); 817 - 898 + filled = __io_post_aux_cqe(ctx, user_data, res, cflags); 818 899 io_cq_unlock_post(ctx); 819 900 return filled; 901 + } 902 + 903 + /* 904 + * Must be called from inline task_work so we now a flush will happen later, 905 + * and obviously with ctx->uring_lock held (tw always has that). 
906 + */ 907 + void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) 908 + { 909 + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { 910 + spin_lock(&ctx->completion_lock); 911 + io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); 912 + spin_unlock(&ctx->completion_lock); 913 + } 914 + ctx->submit_state.cq_flush = true; 820 915 } 821 916 822 917 /* ··· 1126 1175 WARN_ON_ONCE(ret); 1127 1176 } 1128 1177 1129 - static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) 1178 + static inline void io_req_local_work_add(struct io_kiocb *req, 1179 + struct io_ring_ctx *ctx, 1180 + unsigned flags) 1130 1181 { 1131 - struct io_ring_ctx *ctx = req->ctx; 1132 1182 unsigned nr_wait, nr_tw, nr_tw_prev; 1133 1183 struct llist_node *head; 1134 1184 ··· 1142 1190 */ 1143 1191 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) 1144 1192 flags &= ~IOU_F_TWQ_LAZY_WAKE; 1193 + 1194 + guard(rcu)(); 1145 1195 1146 1196 head = READ_ONCE(ctx->work_llist.first); 1147 1197 do { ··· 1226 1272 1227 1273 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) 1228 1274 { 1229 - if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 1230 - rcu_read_lock(); 1231 - io_req_local_work_add(req, flags); 1232 - rcu_read_unlock(); 1233 - } else { 1275 + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) 1276 + io_req_local_work_add(req, req->ctx, flags); 1277 + else 1234 1278 io_req_normal_work_add(req); 1235 - } 1279 + } 1280 + 1281 + void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, 1282 + unsigned flags) 1283 + { 1284 + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))) 1285 + return; 1286 + io_req_local_work_add(req, ctx, flags); 1236 1287 } 1237 1288 1238 1289 static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) ··· 1426 1467 } 1427 1468 __io_cq_unlock_post(ctx); 1428 1469 1429 - if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { 1470 + if 
(!wq_list_empty(&state->compl_reqs)) { 1430 1471 io_free_batch_list(ctx, state->compl_reqs.first); 1431 1472 INIT_WQ_LIST(&state->compl_reqs); 1432 1473 } ··· 1772 1813 io_arm_ltimeout(req); 1773 1814 1774 1815 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ 1775 - if (work->flags & IO_WQ_WORK_CANCEL) { 1816 + if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { 1776 1817 fail: 1777 1818 io_req_task_queue_fail(req, err); 1778 1819 return; 1779 1820 } 1780 1821 if (!io_assign_file(req, def, issue_flags)) { 1781 1822 err = -EBADF; 1782 - work->flags |= IO_WQ_WORK_CANCEL; 1823 + atomic_or(IO_WQ_WORK_CANCEL, &work->flags); 1783 1824 goto fail; 1784 1825 } 1785 1826 ··· 2608 2649 io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 2609 2650 io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 2610 2651 io_alloc_cache_free(&ctx->uring_cache, kfree); 2652 + io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); 2611 2653 io_futex_cache_free(ctx); 2612 2654 io_destroy_buffers(ctx); 2613 2655 mutex_unlock(&ctx->uring_lock);

+3 -6

io_uring/io_uring.h

··· 65 65 int io_run_task_work_sig(struct io_ring_ctx *ctx); 66 66 void io_req_defer_failed(struct io_kiocb *req, s32 res); 67 67 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); 68 + void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); 68 69 bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); 69 70 void __io_commit_cqring_flush(struct io_ring_ctx *ctx); 70 71 ··· 74 73 unsigned issue_flags); 75 74 76 75 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); 76 + void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, 77 + unsigned flags); 77 78 bool io_alloc_async_data(struct io_kiocb *req); 78 79 void io_req_task_queue(struct io_kiocb *req); 79 80 void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); ··· 107 104 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, 108 105 bool cancel_all); 109 106 110 - enum { 111 - IO_EVENTFD_OP_SIGNAL_BIT, 112 - IO_EVENTFD_OP_FREE_BIT, 113 - }; 114 - 115 - void io_eventfd_ops(struct rcu_head *rcu); 116 107 void io_activate_pollwq(struct io_ring_ctx *ctx); 117 108 118 109 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)

+84 -50

io_uring/msg_ring.c

··· 11 11 #include "io_uring.h" 12 12 #include "rsrc.h" 13 13 #include "filetable.h" 14 + #include "alloc_cache.h" 14 15 #include "msg_ring.h" 15 - 16 16 17 17 /* All valid masks for MSG_RING */ 18 18 #define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ ··· 68 68 69 69 static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) 70 70 { 71 - if (!target_ctx->task_complete) 72 - return false; 73 - return current != target_ctx->submitter_task; 71 + return target_ctx->task_complete; 74 72 } 75 73 76 - static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) 74 + static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) 77 75 { 78 - struct io_ring_ctx *ctx = req->file->private_data; 79 - struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); 80 - struct task_struct *task = READ_ONCE(ctx->submitter_task); 76 + struct io_ring_ctx *ctx = req->ctx; 81 77 82 - if (unlikely(!task)) 83 - return -EOWNERDEAD; 84 - 85 - init_task_work(&msg->tw, func); 86 - if (task_work_add(task, &msg->tw, TWA_SIGNAL)) 87 - return -EOWNERDEAD; 88 - 89 - return IOU_ISSUE_SKIP_COMPLETE; 90 - } 91 - 92 - static void io_msg_tw_complete(struct callback_head *head) 93 - { 94 - struct io_msg *msg = container_of(head, struct io_msg, tw); 95 - struct io_kiocb *req = cmd_to_io_kiocb(msg); 96 - struct io_ring_ctx *target_ctx = req->file->private_data; 97 - int ret = 0; 98 - 99 - if (current->flags & PF_EXITING) { 100 - ret = -EOWNERDEAD; 101 - } else { 102 - u32 flags = 0; 103 - 104 - if (msg->flags & IORING_MSG_RING_FLAGS_PASS) 105 - flags = msg->cqe_flags; 106 - 107 - /* 108 - * If the target ring is using IOPOLL mode, then we need to be 109 - * holding the uring_lock for posting completions. Other ring 110 - * types rely on the regular completion locking, which is 111 - * handled while posting. 
112 - */ 113 - if (target_ctx->flags & IORING_SETUP_IOPOLL) 114 - mutex_lock(&target_ctx->uring_lock); 115 - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) 116 - ret = -EOVERFLOW; 117 - if (target_ctx->flags & IORING_SETUP_IOPOLL) 118 - mutex_unlock(&target_ctx->uring_lock); 78 + io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); 79 + if (spin_trylock(&ctx->msg_lock)) { 80 + if (io_alloc_cache_put(&ctx->msg_cache, req)) 81 + req = NULL; 82 + spin_unlock(&ctx->msg_lock); 119 83 } 84 + if (req) 85 + kmem_cache_free(req_cachep, req); 86 + percpu_ref_put(&ctx->refs); 87 + } 120 88 121 - if (ret < 0) 122 - req_set_fail(req); 123 - io_req_queue_tw_complete(req, ret); 89 + static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, 90 + int res, u32 cflags, u64 user_data) 91 + { 92 + req->task = READ_ONCE(ctx->submitter_task); 93 + if (!req->task) { 94 + kmem_cache_free(req_cachep, req); 95 + return -EOWNERDEAD; 96 + } 97 + req->cqe.user_data = user_data; 98 + io_req_set_res(req, res, cflags); 99 + percpu_ref_get(&ctx->refs); 100 + req->ctx = ctx; 101 + req->io_task_work.func = io_msg_tw_complete; 102 + io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); 103 + return 0; 104 + } 105 + 106 + static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) 107 + { 108 + struct io_kiocb *req = NULL; 109 + 110 + if (spin_trylock(&ctx->msg_lock)) { 111 + req = io_alloc_cache_get(&ctx->msg_cache); 112 + spin_unlock(&ctx->msg_lock); 113 + } 114 + if (req) 115 + return req; 116 + return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN); 117 + } 118 + 119 + static int io_msg_data_remote(struct io_kiocb *req) 120 + { 121 + struct io_ring_ctx *target_ctx = req->file->private_data; 122 + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); 123 + struct io_kiocb *target; 124 + u32 flags = 0; 125 + 126 + target = io_msg_get_kiocb(req->ctx); 127 + if (unlikely(!target)) 128 + return -ENOMEM; 129 + 130 + if 
(msg->flags & IORING_MSG_RING_FLAGS_PASS) 131 + flags = msg->cqe_flags; 132 + 133 + return io_msg_remote_post(target_ctx, target, msg->len, flags, 134 + msg->user_data); 124 135 } 125 136 126 137 static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) ··· 149 138 return -EBADFD; 150 139 151 140 if (io_msg_need_remote(target_ctx)) 152 - return io_msg_exec_remote(req, io_msg_tw_complete); 141 + return io_msg_data_remote(req); 153 142 154 143 if (msg->flags & IORING_MSG_RING_FLAGS_PASS) 155 144 flags = msg->cqe_flags; ··· 229 218 io_req_queue_tw_complete(req, ret); 230 219 } 231 220 221 + static int io_msg_fd_remote(struct io_kiocb *req) 222 + { 223 + struct io_ring_ctx *ctx = req->file->private_data; 224 + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); 225 + struct task_struct *task = READ_ONCE(ctx->submitter_task); 226 + 227 + if (unlikely(!task)) 228 + return -EOWNERDEAD; 229 + 230 + init_task_work(&msg->tw, io_msg_tw_fd_complete); 231 + if (task_work_add(task, &msg->tw, TWA_SIGNAL)) 232 + return -EOWNERDEAD; 233 + 234 + return IOU_ISSUE_SKIP_COMPLETE; 235 + } 236 + 232 237 static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) 233 238 { 234 239 struct io_ring_ctx *target_ctx = req->file->private_data; ··· 267 240 } 268 241 269 242 if (io_msg_need_remote(target_ctx)) 270 - return io_msg_exec_remote(req, io_msg_tw_fd_complete); 243 + return io_msg_fd_remote(req); 271 244 return io_msg_install_complete(req, issue_flags); 272 245 } 273 246 ··· 320 293 } 321 294 io_req_set_res(req, ret, 0); 322 295 return IOU_OK; 296 + } 297 + 298 + void io_msg_cache_free(const void *entry) 299 + { 300 + struct io_kiocb *req = (struct io_kiocb *) entry; 301 + 302 + kmem_cache_free(req_cachep, req); 323 303 }

io_uring/msg_ring.h

··· 3 3 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 4 4 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); 5 5 void io_msg_ring_cleanup(struct io_kiocb *req); 6 + void io_msg_cache_free(const void *entry);

+1 -1

io_uring/napi.c

··· 283 283 s64 poll_to_ns = timespec64_to_ns(ts); 284 284 if (poll_to_ns > 0) { 285 285 u64 val = poll_to_ns + 999; 286 - do_div(val, (s64) 1000); 286 + do_div(val, 1000); 287 287 poll_to = val; 288 288 } 289 289 }

+84 -10

io_uring/net.c

··· 51 51 bool seen_econnaborted; 52 52 }; 53 53 54 + struct io_bind { 55 + struct file *file; 56 + int addr_len; 57 + }; 58 + 59 + struct io_listen { 60 + struct file *file; 61 + int backlog; 62 + }; 63 + 54 64 struct io_sr_msg { 55 65 struct file *file; 56 66 union { ··· 827 817 bool mshot_finished, unsigned issue_flags) 828 818 { 829 819 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 830 - unsigned int cflags; 831 - 832 - if (sr->flags & IORING_RECVSEND_BUNDLE) 833 - cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), 834 - issue_flags); 835 - else 836 - cflags = io_put_kbuf(req, issue_flags); 820 + unsigned int cflags = 0; 837 821 838 822 if (kmsg->msg.msg_inq > 0) 839 823 cflags |= IORING_CQE_F_SOCK_NONEMPTY; 840 824 841 - /* bundle with no more immediate buffers, we're done */ 842 - if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY) 843 - goto finish; 825 + if (sr->flags & IORING_RECVSEND_BUNDLE) { 826 + cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), 827 + issue_flags); 828 + /* bundle with no more immediate buffers, we're done */ 829 + if (req->flags & REQ_F_BL_EMPTY) 830 + goto finish; 831 + } else { 832 + cflags |= io_put_kbuf(req, issue_flags); 833 + } 844 834 845 835 /* 846 836 * Fill CQE for this receive and see if we should keep trying to ··· 1725 1715 io_req_msg_cleanup(req, issue_flags); 1726 1716 io_req_set_res(req, ret, 0); 1727 1717 return IOU_OK; 1718 + } 1719 + 1720 + int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1721 + { 1722 + struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); 1723 + struct sockaddr __user *uaddr; 1724 + struct io_async_msghdr *io; 1725 + 1726 + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) 1727 + return -EINVAL; 1728 + 1729 + uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 1730 + bind->addr_len = READ_ONCE(sqe->addr2); 1731 + 1732 + io = io_msg_alloc_async(req); 1733 + if (unlikely(!io)) 1734 + return -ENOMEM; 1735 + 
return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr); 1736 + } 1737 + 1738 + int io_bind(struct io_kiocb *req, unsigned int issue_flags) 1739 + { 1740 + struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind); 1741 + struct io_async_msghdr *io = req->async_data; 1742 + struct socket *sock; 1743 + int ret; 1744 + 1745 + sock = sock_from_file(req->file); 1746 + if (unlikely(!sock)) 1747 + return -ENOTSOCK; 1748 + 1749 + ret = __sys_bind_socket(sock, &io->addr, bind->addr_len); 1750 + if (ret < 0) 1751 + req_set_fail(req); 1752 + io_req_set_res(req, ret, 0); 1753 + return 0; 1754 + } 1755 + 1756 + int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1757 + { 1758 + struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen); 1759 + 1760 + if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2) 1761 + return -EINVAL; 1762 + 1763 + listen->backlog = READ_ONCE(sqe->len); 1764 + return 0; 1765 + } 1766 + 1767 + int io_listen(struct io_kiocb *req, unsigned int issue_flags) 1768 + { 1769 + struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen); 1770 + struct socket *sock; 1771 + int ret; 1772 + 1773 + sock = sock_from_file(req->file); 1774 + if (unlikely(!sock)) 1775 + return -ENOTSOCK; 1776 + 1777 + ret = __sys_listen_socket(sock, listen->backlog); 1778 + if (ret < 0) 1779 + req_set_fail(req); 1780 + io_req_set_res(req, ret, 0); 1781 + return 0; 1728 1782 } 1729 1783 1730 1784 void io_netmsg_cache_free(const void *entry)

io_uring/net.h

··· 49 49 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 50 50 void io_send_zc_cleanup(struct io_kiocb *req); 51 51 52 + int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 53 + int io_bind(struct io_kiocb *req, unsigned int issue_flags); 54 + 55 + int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 56 + int io_listen(struct io_kiocb *req, unsigned int issue_flags); 57 + 52 58 void io_netmsg_cache_free(const void *entry); 53 59 #else 54 60 static inline void io_netmsg_cache_free(const void *entry)

+34

io_uring/opdef.c

··· 495 495 .prep = io_ftruncate_prep, 496 496 .issue = io_ftruncate, 497 497 }, 498 + [IORING_OP_BIND] = { 499 + #if defined(CONFIG_NET) 500 + .needs_file = 1, 501 + .prep = io_bind_prep, 502 + .issue = io_bind, 503 + .async_size = sizeof(struct io_async_msghdr), 504 + #else 505 + .prep = io_eopnotsupp_prep, 506 + #endif 507 + }, 508 + [IORING_OP_LISTEN] = { 509 + #if defined(CONFIG_NET) 510 + .needs_file = 1, 511 + .prep = io_listen_prep, 512 + .issue = io_listen, 513 + .async_size = sizeof(struct io_async_msghdr), 514 + #else 515 + .prep = io_eopnotsupp_prep, 516 + #endif 517 + }, 498 518 }; 499 519 500 520 const struct io_cold_def io_cold_defs[] = { ··· 736 716 [IORING_OP_FTRUNCATE] = { 737 717 .name = "FTRUNCATE", 738 718 }, 719 + [IORING_OP_BIND] = { 720 + .name = "BIND", 721 + }, 722 + [IORING_OP_LISTEN] = { 723 + .name = "LISTEN", 724 + }, 739 725 }; 740 726 741 727 const char *io_uring_get_opcode(u8 opcode) ··· 749 723 if (opcode < IORING_OP_LAST) 750 724 return io_cold_defs[opcode].name; 751 725 return "INVALID"; 726 + } 727 + 728 + bool io_uring_op_supported(u8 opcode) 729 + { 730 + if (opcode < IORING_OP_LAST && 731 + io_issue_defs[opcode].prep != io_eopnotsupp_prep) 732 + return true; 733 + return false; 752 734 } 753 735 754 736 void __init io_uring_optable_init(void)

+2 -2

io_uring/opdef.h

··· 17 17 unsigned poll_exclusive : 1; 18 18 /* op supports buffer selection */ 19 19 unsigned buffer_select : 1; 20 - /* opcode is not supported by this kernel */ 21 - unsigned not_supported : 1; 22 20 /* skip auditing */ 23 21 unsigned audit_skip : 1; 24 22 /* supports ioprio */ ··· 44 46 45 47 extern const struct io_issue_def io_issue_defs[]; 46 48 extern const struct io_cold_def io_cold_defs[]; 49 + 50 + bool io_uring_op_supported(u8 opcode); 47 51 48 52 void io_uring_optable_init(void); 49 53 #endif

+5 -60

io_uring/register.c

··· 27 27 #include "cancel.h" 28 28 #include "kbuf.h" 29 29 #include "napi.h" 30 + #include "eventfd.h" 30 31 31 32 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 32 33 IORING_REGISTER_LAST + IORING_OP_LAST) 33 - 34 - static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, 35 - unsigned int eventfd_async) 36 - { 37 - struct io_ev_fd *ev_fd; 38 - __s32 __user *fds = arg; 39 - int fd; 40 - 41 - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, 42 - lockdep_is_held(&ctx->uring_lock)); 43 - if (ev_fd) 44 - return -EBUSY; 45 - 46 - if (copy_from_user(&fd, fds, sizeof(*fds))) 47 - return -EFAULT; 48 - 49 - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); 50 - if (!ev_fd) 51 - return -ENOMEM; 52 - 53 - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); 54 - if (IS_ERR(ev_fd->cq_ev_fd)) { 55 - int ret = PTR_ERR(ev_fd->cq_ev_fd); 56 - kfree(ev_fd); 57 - return ret; 58 - } 59 - 60 - spin_lock(&ctx->completion_lock); 61 - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; 62 - spin_unlock(&ctx->completion_lock); 63 - 64 - ev_fd->eventfd_async = eventfd_async; 65 - ctx->has_evfd = true; 66 - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); 67 - atomic_set(&ev_fd->refs, 1); 68 - atomic_set(&ev_fd->ops, 0); 69 - return 0; 70 - } 71 - 72 - int io_eventfd_unregister(struct io_ring_ctx *ctx) 73 - { 74 - struct io_ev_fd *ev_fd; 75 - 76 - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, 77 - lockdep_is_held(&ctx->uring_lock)); 78 - if (ev_fd) { 79 - ctx->has_evfd = false; 80 - rcu_assign_pointer(ctx->io_ev_fd, NULL); 81 - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) 82 - call_rcu(&ev_fd->rcu, io_eventfd_ops); 83 - return 0; 84 - } 85 - 86 - return -ENXIO; 87 - } 88 34 89 35 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, 90 36 unsigned nr_args) ··· 39 93 size_t size; 40 94 int i, ret; 41 95 96 + if (nr_args > IORING_OP_LAST) 97 + nr_args = IORING_OP_LAST; 98 + 42 99 size = struct_size(p, ops, nr_args); 43 - if (size == SIZE_MAX) 44 - 
return -EOVERFLOW; 45 100 p = kzalloc(size, GFP_KERNEL); 46 101 if (!p) 47 102 return -ENOMEM; ··· 55 108 goto out; 56 109 57 110 p->last_op = IORING_OP_LAST - 1; 58 - if (nr_args > IORING_OP_LAST) 59 - nr_args = IORING_OP_LAST; 60 111 61 112 for (i = 0; i < nr_args; i++) { 62 113 p->ops[i].op = i; 63 - if (!io_issue_defs[i].not_supported) 114 + if (io_uring_op_supported(i)) 64 115 p->ops[i].flags = IO_URING_OP_SUPPORTED; 65 116 } 66 117 p->ops_len = i;

+23 -42

io_uring/rsrc.c

··· 85 85 return 0; 86 86 } 87 87 88 - static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, 89 - void __user *arg, unsigned index) 90 - { 91 - struct iovec __user *src; 92 - 93 - #ifdef CONFIG_COMPAT 94 - if (ctx->compat) { 95 - struct compat_iovec __user *ciovs; 96 - struct compat_iovec ciov; 97 - 98 - ciovs = (struct compat_iovec __user *) arg; 99 - if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) 100 - return -EFAULT; 101 - 102 - dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); 103 - dst->iov_len = ciov.iov_len; 104 - return 0; 105 - } 106 - #endif 107 - src = (struct iovec __user *) arg; 108 - if (copy_from_user(dst, &src[index], sizeof(*dst))) 109 - return -EFAULT; 110 - return 0; 111 - } 112 - 113 88 static int io_buffer_validate(struct iovec *iov) 114 89 { 115 90 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); ··· 224 249 225 250 ret = io_run_task_work_sig(ctx); 226 251 if (ret < 0) { 227 - __set_current_state(TASK_RUNNING); 252 + finish_wait(&ctx->rsrc_quiesce_wq, &we); 228 253 mutex_lock(&ctx->uring_lock); 229 254 if (list_empty(&ctx->rsrc_ref_list)) 230 255 ret = 0; ··· 232 257 } 233 258 234 259 schedule(); 235 - __set_current_state(TASK_RUNNING); 236 260 mutex_lock(&ctx->uring_lock); 237 261 ret = 0; 238 262 } while (!list_empty(&ctx->rsrc_ref_list)); ··· 394 420 struct io_uring_rsrc_update2 *up, 395 421 unsigned int nr_args) 396 422 { 423 + struct iovec __user *uvec = u64_to_user_ptr(up->data); 397 424 u64 __user *tags = u64_to_user_ptr(up->tags); 398 - struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); 425 + struct iovec fast_iov, *iov; 399 426 struct page *last_hpage = NULL; 400 427 __u32 done; 401 428 int i, err; ··· 410 435 struct io_mapped_ubuf *imu; 411 436 u64 tag = 0; 412 437 413 - err = io_copy_iov(ctx, &iov, iovs, done); 414 - if (err) 438 + iov = iovec_from_user(&uvec[done], 1, 1, &fast_iov, ctx->compat); 439 + if (IS_ERR(iov)) { 440 + err = PTR_ERR(iov); 415 441 break; 442 + } 416 443 if (tags 
&& copy_from_user(&tag, &tags[done], sizeof(tag))) { 417 444 err = -EFAULT; 418 445 break; 419 446 } 420 - err = io_buffer_validate(&iov); 447 + err = io_buffer_validate(iov); 421 448 if (err) 422 449 break; 423 - if (!iov.iov_base && tag) { 450 + if (!iov->iov_base && tag) { 424 451 err = -EINVAL; 425 452 break; 426 453 } 427 - err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); 454 + err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage); 428 455 if (err) 429 456 break; 430 457 ··· 948 971 { 949 972 struct page *last_hpage = NULL; 950 973 struct io_rsrc_data *data; 974 + struct iovec fast_iov, *iov = &fast_iov; 975 + const struct iovec __user *uvec = (struct iovec * __user) arg; 951 976 int i, ret; 952 - struct iovec iov; 953 977 954 978 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); 955 979 ··· 967 989 return ret; 968 990 } 969 991 992 + if (!arg) 993 + memset(iov, 0, sizeof(*iov)); 994 + 970 995 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { 971 996 if (arg) { 972 - ret = io_copy_iov(ctx, &iov, arg, i); 997 + iov = iovec_from_user(&uvec[i], 1, 1, &fast_iov, ctx->compat); 998 + if (IS_ERR(iov)) { 999 + ret = PTR_ERR(iov); 1000 + break; 1001 + } 1002 + ret = io_buffer_validate(iov); 973 1003 if (ret) 974 1004 break; 975 - ret = io_buffer_validate(&iov); 976 - if (ret) 977 - break; 978 - } else { 979 - memset(&iov, 0, sizeof(iov)); 980 1005 } 981 1006 982 - if (!iov.iov_base && *io_get_tag_slot(data, i)) { 1007 + if (!iov->iov_base && *io_get_tag_slot(data, i)) { 983 1008 ret = -EINVAL; 984 1009 break; 985 1010 } 986 1011 987 - ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], 1012 + ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i], 988 1013 &last_hpage); 989 1014 if (ret) 990 1015 break;

kernel/signal.c

··· 2600 2600 spin_unlock_irq(&current->sighand->siglock); 2601 2601 cgroup_enter_frozen(); 2602 2602 schedule(); 2603 + 2604 + /* 2605 + * We could've been woken by task_work, run it to clear 2606 + * TIF_NOTIFY_SIGNAL. The caller will retry if necessary. 2607 + */ 2608 + clear_notify_signal(); 2609 + if (unlikely(task_work_pending(current))) 2610 + task_work_run(); 2603 2611 } 2604 2612 2605 2613 static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)

+30 -18

net/socket.c

··· 1822 1822 return __sys_socketpair(family, type, protocol, usockvec); 1823 1823 } 1824 1824 1825 + int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, 1826 + int addrlen) 1827 + { 1828 + int err; 1829 + 1830 + err = security_socket_bind(sock, (struct sockaddr *)address, 1831 + addrlen); 1832 + if (!err) 1833 + err = READ_ONCE(sock->ops)->bind(sock, 1834 + (struct sockaddr *)address, 1835 + addrlen); 1836 + return err; 1837 + } 1838 + 1825 1839 /* 1826 1840 * Bind a name to a socket. Nothing much to do here since it's 1827 1841 * the protocol's responsibility to handle the local address. ··· 1853 1839 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1854 1840 if (sock) { 1855 1841 err = move_addr_to_kernel(umyaddr, addrlen, &address); 1856 - if (!err) { 1857 - err = security_socket_bind(sock, 1858 - (struct sockaddr *)&address, 1859 - addrlen); 1860 - if (!err) 1861 - err = READ_ONCE(sock->ops)->bind(sock, 1862 - (struct sockaddr *) 1863 - &address, addrlen); 1864 - } 1842 + if (!err) 1843 + err = __sys_bind_socket(sock, &address, addrlen); 1865 1844 fput_light(sock->file, fput_needed); 1866 1845 } 1867 1846 return err; ··· 1870 1863 * necessary for a listen, and if that works, we mark the socket as 1871 1864 * ready for listening. 
1872 1865 */ 1866 + int __sys_listen_socket(struct socket *sock, int backlog) 1867 + { 1868 + int somaxconn, err; 1869 + 1870 + somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); 1871 + if ((unsigned int)backlog > somaxconn) 1872 + backlog = somaxconn; 1873 + 1874 + err = security_socket_listen(sock, backlog); 1875 + if (!err) 1876 + err = READ_ONCE(sock->ops)->listen(sock, backlog); 1877 + return err; 1878 + } 1873 1879 1874 1880 int __sys_listen(int fd, int backlog) 1875 1881 { 1876 1882 struct socket *sock; 1877 1883 int err, fput_needed; 1878 - int somaxconn; 1879 1884 1880 1885 sock = sockfd_lookup_light(fd, &err, &fput_needed); 1881 1886 if (sock) { 1882 - somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); 1883 - if ((unsigned int)backlog > somaxconn) 1884 - backlog = somaxconn; 1885 - 1886 - err = security_socket_listen(sock, backlog); 1887 - if (!err) 1888 - err = READ_ONCE(sock->ops)->listen(sock, backlog); 1889 - 1887 + err = __sys_listen_socket(sock, backlog); 1890 1888 fput_light(sock->file, fput_needed); 1891 1889 } 1892 1890 return err;

Configure Feed

Configure Feed