Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'io_uring-5.5-20191220' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:
"Here's a set of fixes that should go into 5.5-rc3 for io_uring.

This is bigger than I'd like it to be, mainly because we're fixing the
case where an application reuses sqe data right after issue. This
really must work, or it's confusing. With 5.5 we're flagging ourselves as
submit stable for the actual data; this must also be the case for
SQEs.

Honestly, I'd really like to add another series on top of this, since
it cleans it up considerably and prevents any SQE reuse by design. I
posted that here:

https://lore.kernel.org/io-uring/20191220174742.7449-1-axboe@kernel.dk/T/#u

and may still send it your way early next week once it's been looked
at and had some more soak time (does pass all regression tests). With
that series, we've unified the prep+issue handling, and only the prep
phase even has access to the SQE.

Anyway, outside of that, there are fixes in here for a few other issues
that have been hit in testing or production"

* tag 'io_uring-5.5-20191220' of git://git.kernel.dk/linux-block:
io_uring: io_wq_submit_work() should not touch req->rw
io_uring: don't wait when under-submitting
io_uring: warn about unhandled opcode
io_uring: read opcode and user_data from SQE exactly once
io_uring: make IORING_OP_TIMEOUT_REMOVE deferrable
io_uring: make IORING_OP_CANCEL_ASYNC deferrable
io_uring: make IORING_POLL_ADD and IORING_POLL_REMOVE deferrable
io_uring: make HARDLINK imply LINK
io_uring: any deferred command must have stable sqe data
io_uring: remove 'sqe' parameter to the OP helpers that take it
io_uring: fix pre-prepped issue with force_nonblock == true
io-wq: re-add io_wq_current_is_worker()
io_uring: fix sporadic -EFAULT from IORING_OP_RECVMSG
io_uring: fix stale comment and a few typos

+497 -233
+1 -1
fs/io-wq.c
··· 948 948 /* 949 949 * Now check if a free (going busy) or busy worker has the work 950 950 * currently running. If we find it there, we'll return CANCEL_RUNNING 951 - * as an indication that we attempte to signal cancellation. The 951 + * as an indication that we attempt to signal cancellation. The 952 952 * completion will run normally in this case. 953 953 */ 954 954 rcu_read_lock();
+6 -2
fs/io-wq.h
··· 120 120 static inline void io_wq_worker_running(struct task_struct *tsk) 121 121 { 122 122 } 123 - #endif /* CONFIG_IO_WQ */ 123 + #endif 124 124 125 - #endif /* INTERNAL_IO_WQ_H */ 125 + static inline bool io_wq_current_is_worker(void) 126 + { 127 + return in_task() && (current->flags & PF_IO_WORKER); 128 + } 129 + #endif
+490 -230
fs/io_uring.c
··· 289 289 */ 290 290 struct io_poll_iocb { 291 291 struct file *file; 292 - struct wait_queue_head *head; 292 + union { 293 + struct wait_queue_head *head; 294 + u64 addr; 295 + }; 293 296 __poll_t events; 294 297 bool done; 295 298 bool canceled; ··· 305 302 struct timespec64 ts; 306 303 enum hrtimer_mode mode; 307 304 u32 seq_offset; 305 + }; 306 + 307 + struct io_accept { 308 + struct file *file; 309 + struct sockaddr __user *addr; 310 + int __user *addr_len; 311 + int flags; 312 + }; 313 + 314 + struct io_sync { 315 + struct file *file; 316 + loff_t len; 317 + loff_t off; 318 + int flags; 319 + }; 320 + 321 + struct io_cancel { 322 + struct file *file; 323 + u64 addr; 324 + }; 325 + 326 + struct io_timeout { 327 + struct file *file; 328 + u64 addr; 329 + int flags; 308 330 }; 309 331 310 332 struct io_async_connect { ··· 371 343 struct file *file; 372 344 struct kiocb rw; 373 345 struct io_poll_iocb poll; 346 + struct io_accept accept; 347 + struct io_sync sync; 348 + struct io_cancel cancel; 349 + struct io_timeout timeout; 374 350 }; 375 351 376 352 const struct io_uring_sqe *sqe; ··· 384 352 bool has_user; 385 353 bool in_async; 386 354 bool needs_fixed_file; 355 + u8 opcode; 387 356 388 357 struct io_ring_ctx *ctx; 389 358 union { ··· 411 378 #define REQ_F_INFLIGHT 16384 /* on inflight list */ 412 379 #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ 413 380 #define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ 381 + #define REQ_F_PREPPED 131072 /* request already opcode prepared */ 414 382 u64 user_data; 415 383 u32 result; 416 384 u32 sequence; ··· 598 564 } 599 565 } 600 566 601 - static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) 567 + static inline bool io_req_needs_user(struct io_kiocb *req) 602 568 { 603 - u8 opcode = READ_ONCE(sqe->opcode); 604 - 605 - return !(opcode == IORING_OP_READ_FIXED || 606 - opcode == IORING_OP_WRITE_FIXED); 569 + return !(req->opcode == IORING_OP_READ_FIXED || 570 + req->opcode == 
IORING_OP_WRITE_FIXED); 607 571 } 608 572 609 573 static inline bool io_prep_async_work(struct io_kiocb *req, ··· 610 578 bool do_hashed = false; 611 579 612 580 if (req->sqe) { 613 - switch (req->sqe->opcode) { 581 + switch (req->opcode) { 614 582 case IORING_OP_WRITEV: 615 583 case IORING_OP_WRITE_FIXED: 616 584 /* only regular files should be hashed for writes */ ··· 633 601 req->work.flags |= IO_WQ_WORK_UNBOUND; 634 602 break; 635 603 } 636 - if (io_sqe_needs_user(req->sqe)) 604 + if (io_req_needs_user(req)) 637 605 req->work.flags |= IO_WQ_WORK_NEEDS_USER; 638 606 } 639 607 ··· 1004 972 trace_io_uring_fail_link(req, link); 1005 973 1006 974 if ((req->flags & REQ_F_LINK_TIMEOUT) && 1007 - link->sqe->opcode == IORING_OP_LINK_TIMEOUT) { 975 + link->opcode == IORING_OP_LINK_TIMEOUT) { 1008 976 io_link_cancel_timeout(link); 1009 977 } else { 1010 978 io_cqring_fill_event(link, -ECANCELED); ··· 1210 1178 } 1211 1179 1212 1180 /* 1213 - * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a 1181 + * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a 1214 1182 * non-spinning poll check - we'll still enter the driver poll loop, but only 1215 1183 * as a non-spinning completion check. 1216 1184 */ ··· 1647 1615 * for that purpose and instead let the caller pass in the read/write 1648 1616 * flag. 
1649 1617 */ 1650 - opcode = READ_ONCE(sqe->opcode); 1618 + opcode = req->opcode; 1651 1619 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { 1652 1620 *iovec = NULL; 1653 1621 return io_import_fixed(req->ctx, rw, sqe, iter); ··· 1733 1701 return ret; 1734 1702 } 1735 1703 1736 - static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, 1704 + static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, 1737 1705 struct iovec *iovec, struct iovec *fast_iov, 1738 1706 struct iov_iter *iter) 1739 1707 { ··· 1747 1715 } 1748 1716 } 1749 1717 1750 - static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size, 1751 - struct iovec *iovec, struct iovec *fast_iov, 1752 - struct iov_iter *iter) 1718 + static int io_alloc_async_ctx(struct io_kiocb *req) 1753 1719 { 1754 1720 req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); 1755 1721 if (req->io) { 1756 - io_req_map_io(req, io_size, iovec, fast_iov, iter); 1757 1722 memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); 1758 1723 req->sqe = &req->io->sqe; 1759 1724 return 0; 1760 1725 } 1761 1726 1762 - return -ENOMEM; 1727 + return 1; 1728 + } 1729 + 1730 + static void io_rw_async(struct io_wq_work **workptr) 1731 + { 1732 + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 1733 + struct iovec *iov = NULL; 1734 + 1735 + if (req->io->rw.iov != req->io->rw.fast_iov) 1736 + iov = req->io->rw.iov; 1737 + io_wq_submit_work(workptr); 1738 + kfree(iov); 1739 + } 1740 + 1741 + static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, 1742 + struct iovec *iovec, struct iovec *fast_iov, 1743 + struct iov_iter *iter) 1744 + { 1745 + if (!req->io && io_alloc_async_ctx(req)) 1746 + return -ENOMEM; 1747 + 1748 + io_req_map_rw(req, io_size, iovec, fast_iov, iter); 1749 + req->work.func = io_rw_async; 1750 + return 0; 1763 1751 } 1764 1752 1765 1753 static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, ··· 1816 1764 if (ret < 0) 1817 1765 return ret; 1818 
1766 } 1767 + 1768 + /* Ensure we clear previously set non-block flag */ 1769 + if (!force_nonblock) 1770 + req->rw.ki_flags &= ~IOCB_NOWAIT; 1819 1771 1820 1772 file = req->file; 1821 1773 io_size = ret; ··· 1862 1806 kiocb_done(kiocb, ret2, nxt, req->in_async); 1863 1807 } else { 1864 1808 copy_iov: 1865 - ret = io_setup_async_io(req, io_size, iovec, 1809 + ret = io_setup_async_rw(req, io_size, iovec, 1866 1810 inline_vecs, &iter); 1867 1811 if (ret) 1868 1812 goto out_free; ··· 1870 1814 } 1871 1815 } 1872 1816 out_free: 1873 - kfree(iovec); 1817 + if (!io_wq_current_is_worker()) 1818 + kfree(iovec); 1874 1819 return ret; 1875 1820 } 1876 1821 ··· 1909 1852 if (ret < 0) 1910 1853 return ret; 1911 1854 } 1855 + 1856 + /* Ensure we clear previously set non-block flag */ 1857 + if (!force_nonblock) 1858 + req->rw.ki_flags &= ~IOCB_NOWAIT; 1912 1859 1913 1860 file = kiocb->ki_filp; 1914 1861 io_size = ret; ··· 1961 1900 kiocb_done(kiocb, ret2, nxt, req->in_async); 1962 1901 } else { 1963 1902 copy_iov: 1964 - ret = io_setup_async_io(req, io_size, iovec, 1903 + ret = io_setup_async_rw(req, io_size, iovec, 1965 1904 inline_vecs, &iter); 1966 1905 if (ret) 1967 1906 goto out_free; ··· 1969 1908 } 1970 1909 } 1971 1910 out_free: 1972 - kfree(iovec); 1911 + if (!io_wq_current_is_worker()) 1912 + kfree(iovec); 1973 1913 return ret; 1974 1914 } 1975 1915 ··· 1989 1927 return 0; 1990 1928 } 1991 1929 1992 - static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1930 + static int io_prep_fsync(struct io_kiocb *req) 1993 1931 { 1932 + const struct io_uring_sqe *sqe = req->sqe; 1994 1933 struct io_ring_ctx *ctx = req->ctx; 1995 1934 1935 + if (req->flags & REQ_F_PREPPED) 1936 + return 0; 1996 1937 if (!req->file) 1997 1938 return -EBADF; 1998 1939 ··· 2004 1939 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) 2005 1940 return -EINVAL; 2006 1941 1942 + req->sync.flags = READ_ONCE(sqe->fsync_flags); 1943 + if (unlikely(req->sync.flags & 
~IORING_FSYNC_DATASYNC)) 1944 + return -EINVAL; 1945 + 1946 + req->sync.off = READ_ONCE(sqe->off); 1947 + req->sync.len = READ_ONCE(sqe->len); 1948 + req->flags |= REQ_F_PREPPED; 2007 1949 return 0; 2008 1950 } 2009 1951 2010 - static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2011 - struct io_kiocb **nxt, bool force_nonblock) 1952 + static bool io_req_cancelled(struct io_kiocb *req) 2012 1953 { 2013 - loff_t sqe_off = READ_ONCE(sqe->off); 2014 - loff_t sqe_len = READ_ONCE(sqe->len); 2015 - loff_t end = sqe_off + sqe_len; 2016 - unsigned fsync_flags; 1954 + if (req->work.flags & IO_WQ_WORK_CANCEL) { 1955 + req_set_fail_links(req); 1956 + io_cqring_add_event(req, -ECANCELED); 1957 + io_put_req(req); 1958 + return true; 1959 + } 1960 + 1961 + return false; 1962 + } 1963 + 1964 + static void io_fsync_finish(struct io_wq_work **workptr) 1965 + { 1966 + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 1967 + loff_t end = req->sync.off + req->sync.len; 1968 + struct io_kiocb *nxt = NULL; 2017 1969 int ret; 2018 1970 2019 - fsync_flags = READ_ONCE(sqe->fsync_flags); 2020 - if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC)) 2021 - return -EINVAL; 1971 + if (io_req_cancelled(req)) 1972 + return; 2022 1973 2023 - ret = io_prep_fsync(req, sqe); 1974 + ret = vfs_fsync_range(req->rw.ki_filp, req->sync.off, 1975 + end > 0 ? 
end : LLONG_MAX, 1976 + req->sync.flags & IORING_FSYNC_DATASYNC); 1977 + if (ret < 0) 1978 + req_set_fail_links(req); 1979 + io_cqring_add_event(req, ret); 1980 + io_put_req_find_next(req, &nxt); 1981 + if (nxt) 1982 + *workptr = &nxt->work; 1983 + } 1984 + 1985 + static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, 1986 + bool force_nonblock) 1987 + { 1988 + struct io_wq_work *work, *old_work; 1989 + int ret; 1990 + 1991 + ret = io_prep_fsync(req); 2024 1992 if (ret) 2025 1993 return ret; 2026 1994 2027 1995 /* fsync always requires a blocking context */ 2028 - if (force_nonblock) 1996 + if (force_nonblock) { 1997 + io_put_req(req); 1998 + req->work.func = io_fsync_finish; 2029 1999 return -EAGAIN; 2000 + } 2030 2001 2031 - ret = vfs_fsync_range(req->rw.ki_filp, sqe_off, 2032 - end > 0 ? end : LLONG_MAX, 2033 - fsync_flags & IORING_FSYNC_DATASYNC); 2034 - 2035 - if (ret < 0) 2036 - req_set_fail_links(req); 2037 - io_cqring_add_event(req, ret); 2038 - io_put_req_find_next(req, nxt); 2002 + work = old_work = &req->work; 2003 + io_fsync_finish(&work); 2004 + if (work && work != old_work) 2005 + *nxt = container_of(work, struct io_kiocb, work); 2039 2006 return 0; 2040 2007 } 2041 2008 2042 - static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2009 + static int io_prep_sfr(struct io_kiocb *req) 2043 2010 { 2011 + const struct io_uring_sqe *sqe = req->sqe; 2044 2012 struct io_ring_ctx *ctx = req->ctx; 2045 - int ret = 0; 2046 2013 2014 + if (req->flags & REQ_F_PREPPED) 2015 + return 0; 2047 2016 if (!req->file) 2048 2017 return -EBADF; 2049 2018 ··· 2086 1987 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) 2087 1988 return -EINVAL; 2088 1989 2089 - return ret; 1990 + req->sync.off = READ_ONCE(sqe->off); 1991 + req->sync.len = READ_ONCE(sqe->len); 1992 + req->sync.flags = READ_ONCE(sqe->sync_range_flags); 1993 + req->flags |= REQ_F_PREPPED; 1994 + return 0; 2090 1995 } 2091 1996 2092 - static int io_sync_file_range(struct 
io_kiocb *req, 2093 - const struct io_uring_sqe *sqe, 2094 - struct io_kiocb **nxt, 2095 - bool force_nonblock) 1997 + static void io_sync_file_range_finish(struct io_wq_work **workptr) 2096 1998 { 2097 - loff_t sqe_off; 2098 - loff_t sqe_len; 2099 - unsigned flags; 1999 + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 2000 + struct io_kiocb *nxt = NULL; 2100 2001 int ret; 2101 2002 2102 - ret = io_prep_sfr(req, sqe); 2003 + if (io_req_cancelled(req)) 2004 + return; 2005 + 2006 + ret = sync_file_range(req->rw.ki_filp, req->sync.off, req->sync.len, 2007 + req->sync.flags); 2008 + if (ret < 0) 2009 + req_set_fail_links(req); 2010 + io_cqring_add_event(req, ret); 2011 + io_put_req_find_next(req, &nxt); 2012 + if (nxt) 2013 + *workptr = &nxt->work; 2014 + } 2015 + 2016 + static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, 2017 + bool force_nonblock) 2018 + { 2019 + struct io_wq_work *work, *old_work; 2020 + int ret; 2021 + 2022 + ret = io_prep_sfr(req); 2103 2023 if (ret) 2104 2024 return ret; 2105 2025 2106 2026 /* sync_file_range always requires a blocking context */ 2107 - if (force_nonblock) 2027 + if (force_nonblock) { 2028 + io_put_req(req); 2029 + req->work.func = io_sync_file_range_finish; 2108 2030 return -EAGAIN; 2031 + } 2109 2032 2110 - sqe_off = READ_ONCE(sqe->off); 2111 - sqe_len = READ_ONCE(sqe->len); 2112 - flags = READ_ONCE(sqe->sync_range_flags); 2113 - 2114 - ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); 2115 - 2116 - if (ret < 0) 2117 - req_set_fail_links(req); 2118 - io_cqring_add_event(req, ret); 2119 - io_put_req_find_next(req, nxt); 2033 + work = old_work = &req->work; 2034 + io_sync_file_range_finish(&work); 2035 + if (work && work != old_work) 2036 + *nxt = container_of(work, struct io_kiocb, work); 2120 2037 return 0; 2121 2038 } 2039 + 2040 + #if defined(CONFIG_NET) 2041 + static void io_sendrecv_async(struct io_wq_work **workptr) 2042 + { 2043 + struct io_kiocb *req = 
container_of(*workptr, struct io_kiocb, work); 2044 + struct iovec *iov = NULL; 2045 + 2046 + if (req->io->rw.iov != req->io->rw.fast_iov) 2047 + iov = req->io->msg.iov; 2048 + io_wq_submit_work(workptr); 2049 + kfree(iov); 2050 + } 2051 + #endif 2122 2052 2123 2053 static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) 2124 2054 { ··· 2165 2037 #endif 2166 2038 } 2167 2039 2168 - static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2169 - struct io_kiocb **nxt, bool force_nonblock) 2040 + static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, 2041 + bool force_nonblock) 2170 2042 { 2171 2043 #if defined(CONFIG_NET) 2044 + const struct io_uring_sqe *sqe = req->sqe; 2045 + struct io_async_msghdr *kmsg = NULL; 2172 2046 struct socket *sock; 2173 2047 int ret; 2174 2048 ··· 2179 2049 2180 2050 sock = sock_from_file(req->file, &ret); 2181 2051 if (sock) { 2182 - struct io_async_ctx io, *copy; 2052 + struct io_async_ctx io; 2183 2053 struct sockaddr_storage addr; 2184 - struct msghdr *kmsg; 2185 2054 unsigned flags; 2186 2055 2187 2056 flags = READ_ONCE(sqe->msg_flags); ··· 2190 2061 flags |= MSG_DONTWAIT; 2191 2062 2192 2063 if (req->io) { 2193 - kmsg = &req->io->msg.msg; 2194 - kmsg->msg_name = &addr; 2064 + kmsg = &req->io->msg; 2065 + kmsg->msg.msg_name = &addr; 2066 + /* if iov is set, it's allocated already */ 2067 + if (!kmsg->iov) 2068 + kmsg->iov = kmsg->fast_iov; 2069 + kmsg->msg.msg_iter.iov = kmsg->iov; 2195 2070 } else { 2196 - kmsg = &io.msg.msg; 2197 - kmsg->msg_name = &addr; 2071 + kmsg = &io.msg; 2072 + kmsg->msg.msg_name = &addr; 2198 2073 ret = io_sendmsg_prep(req, &io); 2199 2074 if (ret) 2200 2075 goto out; 2201 2076 } 2202 2077 2203 - ret = __sys_sendmsg_sock(sock, kmsg, flags); 2078 + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 2204 2079 if (force_nonblock && ret == -EAGAIN) { 2205 - copy = kmalloc(sizeof(*copy), GFP_KERNEL); 2206 - if (!copy) { 2207 - ret = -ENOMEM; 2208 - goto out; 
2209 - } 2210 - memcpy(&copy->msg, &io.msg, sizeof(copy->msg)); 2211 - req->io = copy; 2212 - memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); 2213 - req->sqe = &req->io->sqe; 2214 - return ret; 2080 + if (req->io) 2081 + return -EAGAIN; 2082 + if (io_alloc_async_ctx(req)) 2083 + return -ENOMEM; 2084 + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); 2085 + req->work.func = io_sendrecv_async; 2086 + return -EAGAIN; 2215 2087 } 2216 2088 if (ret == -ERESTARTSYS) 2217 2089 ret = -EINTR; 2218 2090 } 2219 2091 2220 2092 out: 2093 + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) 2094 + kfree(kmsg->iov); 2221 2095 io_cqring_add_event(req, ret); 2222 2096 if (ret < 0) 2223 2097 req_set_fail_links(req); ··· 2248 2116 #endif 2249 2117 } 2250 2118 2251 - static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2252 - struct io_kiocb **nxt, bool force_nonblock) 2119 + static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, 2120 + bool force_nonblock) 2253 2121 { 2254 2122 #if defined(CONFIG_NET) 2123 + const struct io_uring_sqe *sqe = req->sqe; 2124 + struct io_async_msghdr *kmsg = NULL; 2255 2125 struct socket *sock; 2256 2126 int ret; 2257 2127 ··· 2263 2129 sock = sock_from_file(req->file, &ret); 2264 2130 if (sock) { 2265 2131 struct user_msghdr __user *msg; 2266 - struct io_async_ctx io, *copy; 2132 + struct io_async_ctx io; 2267 2133 struct sockaddr_storage addr; 2268 - struct msghdr *kmsg; 2269 2134 unsigned flags; 2270 2135 2271 2136 flags = READ_ONCE(sqe->msg_flags); ··· 2276 2143 msg = (struct user_msghdr __user *) (unsigned long) 2277 2144 READ_ONCE(sqe->addr); 2278 2145 if (req->io) { 2279 - kmsg = &req->io->msg.msg; 2280 - kmsg->msg_name = &addr; 2146 + kmsg = &req->io->msg; 2147 + kmsg->msg.msg_name = &addr; 2148 + /* if iov is set, it's allocated already */ 2149 + if (!kmsg->iov) 2150 + kmsg->iov = kmsg->fast_iov; 2151 + kmsg->msg.msg_iter.iov = kmsg->iov; 2281 2152 } else { 2282 - kmsg = &io.msg.msg; 2283 
- kmsg->msg_name = &addr; 2153 + kmsg = &io.msg; 2154 + kmsg->msg.msg_name = &addr; 2284 2155 ret = io_recvmsg_prep(req, &io); 2285 2156 if (ret) 2286 2157 goto out; 2287 2158 } 2288 2159 2289 - ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags); 2160 + ret = __sys_recvmsg_sock(sock, &kmsg->msg, msg, kmsg->uaddr, flags); 2290 2161 if (force_nonblock && ret == -EAGAIN) { 2291 - copy = kmalloc(sizeof(*copy), GFP_KERNEL); 2292 - if (!copy) { 2293 - ret = -ENOMEM; 2294 - goto out; 2295 - } 2296 - memcpy(copy, &io, sizeof(*copy)); 2297 - req->io = copy; 2298 - memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); 2299 - req->sqe = &req->io->sqe; 2300 - return ret; 2162 + if (req->io) 2163 + return -EAGAIN; 2164 + if (io_alloc_async_ctx(req)) 2165 + return -ENOMEM; 2166 + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); 2167 + req->work.func = io_sendrecv_async; 2168 + return -EAGAIN; 2301 2169 } 2302 2170 if (ret == -ERESTARTSYS) 2303 2171 ret = -EINTR; 2304 2172 } 2305 2173 2306 2174 out: 2175 + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) 2176 + kfree(kmsg->iov); 2307 2177 io_cqring_add_event(req, ret); 2308 2178 if (ret < 0) 2309 2179 req_set_fail_links(req); ··· 2317 2181 #endif 2318 2182 } 2319 2183 2320 - static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2321 - struct io_kiocb **nxt, bool force_nonblock) 2184 + static int io_accept_prep(struct io_kiocb *req) 2322 2185 { 2323 2186 #if defined(CONFIG_NET) 2324 - struct sockaddr __user *addr; 2325 - int __user *addr_len; 2326 - unsigned file_flags; 2327 - int flags, ret; 2187 + const struct io_uring_sqe *sqe = req->sqe; 2188 + struct io_accept *accept = &req->accept; 2189 + 2190 + if (req->flags & REQ_F_PREPPED) 2191 + return 0; 2328 2192 2329 2193 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 2330 2194 return -EINVAL; 2331 2195 if (sqe->ioprio || sqe->len || sqe->buf_index) 2332 2196 return -EINVAL; 2333 2197 2334 - addr = 
(struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); 2335 - addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); 2336 - flags = READ_ONCE(sqe->accept_flags); 2337 - file_flags = force_nonblock ? O_NONBLOCK : 0; 2198 + accept->addr = (struct sockaddr __user *) 2199 + (unsigned long) READ_ONCE(sqe->addr); 2200 + accept->addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); 2201 + accept->flags = READ_ONCE(sqe->accept_flags); 2202 + req->flags |= REQ_F_PREPPED; 2203 + return 0; 2204 + #else 2205 + return -EOPNOTSUPP; 2206 + #endif 2207 + } 2338 2208 2339 - ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags); 2340 - if (ret == -EAGAIN && force_nonblock) { 2341 - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; 2209 + #if defined(CONFIG_NET) 2210 + static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, 2211 + bool force_nonblock) 2212 + { 2213 + struct io_accept *accept = &req->accept; 2214 + unsigned file_flags; 2215 + int ret; 2216 + 2217 + file_flags = force_nonblock ? 
O_NONBLOCK : 0; 2218 + ret = __sys_accept4_file(req->file, file_flags, accept->addr, 2219 + accept->addr_len, accept->flags); 2220 + if (ret == -EAGAIN && force_nonblock) 2342 2221 return -EAGAIN; 2343 - } 2344 2222 if (ret == -ERESTARTSYS) 2345 2223 ret = -EINTR; 2346 2224 if (ret < 0) 2347 2225 req_set_fail_links(req); 2348 2226 io_cqring_add_event(req, ret); 2349 2227 io_put_req_find_next(req, nxt); 2228 + return 0; 2229 + } 2230 + 2231 + static void io_accept_finish(struct io_wq_work **workptr) 2232 + { 2233 + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); 2234 + struct io_kiocb *nxt = NULL; 2235 + 2236 + if (io_req_cancelled(req)) 2237 + return; 2238 + __io_accept(req, &nxt, false); 2239 + if (nxt) 2240 + *workptr = &nxt->work; 2241 + } 2242 + #endif 2243 + 2244 + static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, 2245 + bool force_nonblock) 2246 + { 2247 + #if defined(CONFIG_NET) 2248 + int ret; 2249 + 2250 + ret = io_accept_prep(req); 2251 + if (ret) 2252 + return ret; 2253 + 2254 + ret = __io_accept(req, nxt, force_nonblock); 2255 + if (ret == -EAGAIN && force_nonblock) { 2256 + req->work.func = io_accept_finish; 2257 + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; 2258 + io_put_req(req); 2259 + return -EAGAIN; 2260 + } 2350 2261 return 0; 2351 2262 #else 2352 2263 return -EOPNOTSUPP; ··· 2415 2232 #endif 2416 2233 } 2417 2234 2418 - static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2419 - struct io_kiocb **nxt, bool force_nonblock) 2235 + static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, 2236 + bool force_nonblock) 2420 2237 { 2421 2238 #if defined(CONFIG_NET) 2239 + const struct io_uring_sqe *sqe = req->sqe; 2422 2240 struct io_async_ctx __io, *io; 2423 2241 unsigned file_flags; 2424 2242 int addr_len, ret; ··· 2444 2260 ret = __sys_connect_file(req->file, &io->connect.address, addr_len, 2445 2261 file_flags); 2446 2262 if ((ret == -EAGAIN || ret == -EINPROGRESS) && 
force_nonblock) { 2447 - io = kmalloc(sizeof(*io), GFP_KERNEL); 2448 - if (!io) { 2263 + if (req->io) 2264 + return -EAGAIN; 2265 + if (io_alloc_async_ctx(req)) { 2449 2266 ret = -ENOMEM; 2450 2267 goto out; 2451 2268 } 2452 - memcpy(&io->connect, &__io.connect, sizeof(io->connect)); 2453 - req->io = io; 2454 - memcpy(&io->sqe, req->sqe, sizeof(*req->sqe)); 2455 - req->sqe = &io->sqe; 2269 + memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect)); 2456 2270 return -EAGAIN; 2457 2271 } 2458 2272 if (ret == -ERESTARTSYS) ··· 2513 2331 return -ENOENT; 2514 2332 } 2515 2333 2516 - /* 2517 - * Find a running poll command that matches one specified in sqe->addr, 2518 - * and remove it if found. 2519 - */ 2520 - static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2334 + static int io_poll_remove_prep(struct io_kiocb *req) 2521 2335 { 2522 - struct io_ring_ctx *ctx = req->ctx; 2523 - int ret; 2336 + const struct io_uring_sqe *sqe = req->sqe; 2524 2337 2338 + if (req->flags & REQ_F_PREPPED) 2339 + return 0; 2525 2340 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 2526 2341 return -EINVAL; 2527 2342 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || 2528 2343 sqe->poll_events) 2529 2344 return -EINVAL; 2530 2345 2346 + req->poll.addr = READ_ONCE(sqe->addr); 2347 + req->flags |= REQ_F_PREPPED; 2348 + return 0; 2349 + } 2350 + 2351 + /* 2352 + * Find a running poll command that matches one specified in sqe->addr, 2353 + * and remove it if found. 
2354 + */ 2355 + static int io_poll_remove(struct io_kiocb *req) 2356 + { 2357 + struct io_ring_ctx *ctx = req->ctx; 2358 + u64 addr; 2359 + int ret; 2360 + 2361 + ret = io_poll_remove_prep(req); 2362 + if (ret) 2363 + return ret; 2364 + 2365 + addr = req->poll.addr; 2531 2366 spin_lock_irq(&ctx->completion_lock); 2532 - ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr)); 2367 + ret = io_poll_cancel(ctx, addr); 2533 2368 spin_unlock_irq(&ctx->completion_lock); 2534 2369 2535 2370 io_cqring_add_event(req, ret); ··· 2681 2482 hlist_add_head(&req->hash_node, list); 2682 2483 } 2683 2484 2684 - static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2685 - struct io_kiocb **nxt) 2485 + static int io_poll_add_prep(struct io_kiocb *req) 2686 2486 { 2487 + const struct io_uring_sqe *sqe = req->sqe; 2687 2488 struct io_poll_iocb *poll = &req->poll; 2688 - struct io_ring_ctx *ctx = req->ctx; 2689 - struct io_poll_table ipt; 2690 - bool cancel = false; 2691 - __poll_t mask; 2692 2489 u16 events; 2693 2490 2491 + if (req->flags & REQ_F_PREPPED) 2492 + return 0; 2694 2493 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 2695 2494 return -EINVAL; 2696 2495 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) ··· 2696 2499 if (!poll->file) 2697 2500 return -EBADF; 2698 2501 2699 - req->io = NULL; 2700 - INIT_IO_WORK(&req->work, io_poll_complete_work); 2502 + req->flags |= REQ_F_PREPPED; 2701 2503 events = READ_ONCE(sqe->poll_events); 2702 2504 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; 2505 + return 0; 2506 + } 2507 + 2508 + static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) 2509 + { 2510 + struct io_poll_iocb *poll = &req->poll; 2511 + struct io_ring_ctx *ctx = req->ctx; 2512 + struct io_poll_table ipt; 2513 + bool cancel = false; 2514 + __poll_t mask; 2515 + int ret; 2516 + 2517 + ret = io_poll_add_prep(req); 2518 + if (ret) 2519 + return ret; 2520 + 2521 + INIT_IO_WORK(&req->work, 
io_poll_complete_work); 2703 2522 INIT_HLIST_NODE(&req->hash_node); 2704 2523 2705 2524 poll->head = NULL; ··· 2786 2573 2787 2574 /* 2788 2575 * Adjust the reqs sequence before the current one because it 2789 - * will consume a slot in the cq_ring and the the cq_tail 2576 + * will consume a slot in the cq_ring and the cq_tail 2790 2577 * pointer will be increased, otherwise other timeout reqs may 2791 2578 * return in advance without waiting for enough wait_nr. 2792 2579 */ ··· 2832 2619 return 0; 2833 2620 } 2834 2621 2835 - /* 2836 - * Remove or update an existing timeout command 2837 - */ 2838 - static int io_timeout_remove(struct io_kiocb *req, 2839 - const struct io_uring_sqe *sqe) 2622 + static int io_timeout_remove_prep(struct io_kiocb *req) 2840 2623 { 2841 - struct io_ring_ctx *ctx = req->ctx; 2842 - unsigned flags; 2843 - int ret; 2624 + const struct io_uring_sqe *sqe = req->sqe; 2844 2625 2845 - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 2626 + if (req->flags & REQ_F_PREPPED) 2627 + return 0; 2628 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 2846 2629 return -EINVAL; 2847 2630 if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) 2848 2631 return -EINVAL; 2849 - flags = READ_ONCE(sqe->timeout_flags); 2850 - if (flags) 2632 + 2633 + req->timeout.addr = READ_ONCE(sqe->addr); 2634 + req->timeout.flags = READ_ONCE(sqe->timeout_flags); 2635 + if (req->timeout.flags) 2851 2636 return -EINVAL; 2852 2637 2638 + req->flags |= REQ_F_PREPPED; 2639 + return 0; 2640 + } 2641 + 2642 + /* 2643 + * Remove or update an existing timeout command 2644 + */ 2645 + static int io_timeout_remove(struct io_kiocb *req) 2646 + { 2647 + struct io_ring_ctx *ctx = req->ctx; 2648 + int ret; 2649 + 2650 + ret = io_timeout_remove_prep(req); 2651 + if (ret) 2652 + return ret; 2653 + 2853 2654 spin_lock_irq(&ctx->completion_lock); 2854 - ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr)); 2655 + ret = io_timeout_cancel(ctx, req->timeout.addr); 2855 2656 2856 2657 
io_cqring_fill_event(req, ret); 2857 2658 io_commit_cqring(ctx); ··· 2907 2680 data->mode = HRTIMER_MODE_REL; 2908 2681 2909 2682 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); 2910 - req->io = io; 2911 2683 return 0; 2912 2684 } 2913 2685 2914 - static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2686 + static int io_timeout(struct io_kiocb *req) 2915 2687 { 2688 + const struct io_uring_sqe *sqe = req->sqe; 2916 2689 unsigned count; 2917 2690 struct io_ring_ctx *ctx = req->ctx; 2918 2691 struct io_timeout_data *data; 2919 - struct io_async_ctx *io; 2920 2692 struct list_head *entry; 2921 2693 unsigned span = 0; 2694 + int ret; 2922 2695 2923 - io = req->io; 2924 - if (!io) { 2925 - int ret; 2926 - 2927 - io = kmalloc(sizeof(*io), GFP_KERNEL); 2928 - if (!io) 2696 + if (!req->io) { 2697 + if (io_alloc_async_ctx(req)) 2929 2698 return -ENOMEM; 2930 - ret = io_timeout_prep(req, io, false); 2931 - if (ret) { 2932 - kfree(io); 2699 + ret = io_timeout_prep(req, req->io, false); 2700 + if (ret) 2933 2701 return ret; 2934 - } 2935 2702 } 2936 2703 data = &req->io->timeout; 2937 2704 ··· 3052 2831 io_put_req_find_next(req, nxt); 3053 2832 } 3054 2833 3055 - static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, 3056 - struct io_kiocb **nxt) 2834 + static int io_async_cancel_prep(struct io_kiocb *req) 3057 2835 { 3058 - struct io_ring_ctx *ctx = req->ctx; 2836 + const struct io_uring_sqe *sqe = req->sqe; 3059 2837 3060 - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 2838 + if (req->flags & REQ_F_PREPPED) 2839 + return 0; 2840 + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3061 2841 return -EINVAL; 3062 2842 if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || 3063 2843 sqe->cancel_flags) 3064 2844 return -EINVAL; 3065 2845 3066 - io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0); 2846 + req->flags |= REQ_F_PREPPED; 2847 + req->cancel.addr = READ_ONCE(sqe->addr); 3067 2848 return 0; 3068 
2849 } 3069 2850 3070 - static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io) 2851 + static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) 2852 + { 2853 + struct io_ring_ctx *ctx = req->ctx; 2854 + int ret; 2855 + 2856 + ret = io_async_cancel_prep(req); 2857 + if (ret) 2858 + return ret; 2859 + 2860 + io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); 2861 + return 0; 2862 + } 2863 + 2864 + static int io_req_defer_prep(struct io_kiocb *req) 3071 2865 { 3072 2866 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 2867 + struct io_async_ctx *io = req->io; 3073 2868 struct iov_iter iter; 3074 - ssize_t ret; 2869 + ssize_t ret = 0; 3075 2870 3076 - memcpy(&io->sqe, req->sqe, sizeof(io->sqe)); 3077 - req->sqe = &io->sqe; 3078 - 3079 - switch (io->sqe.opcode) { 2871 + switch (req->opcode) { 2872 + case IORING_OP_NOP: 2873 + break; 3080 2874 case IORING_OP_READV: 3081 2875 case IORING_OP_READ_FIXED: 2876 + /* ensure prep does right import */ 2877 + req->io = NULL; 3082 2878 ret = io_read_prep(req, &iovec, &iter, true); 2879 + req->io = io; 2880 + if (ret < 0) 2881 + break; 2882 + io_req_map_rw(req, ret, iovec, inline_vecs, &iter); 2883 + ret = 0; 3083 2884 break; 3084 2885 case IORING_OP_WRITEV: 3085 2886 case IORING_OP_WRITE_FIXED: 2887 + /* ensure prep does right import */ 2888 + req->io = NULL; 3086 2889 ret = io_write_prep(req, &iovec, &iter, true); 2890 + req->io = io; 2891 + if (ret < 0) 2892 + break; 2893 + io_req_map_rw(req, ret, iovec, inline_vecs, &iter); 2894 + ret = 0; 2895 + break; 2896 + case IORING_OP_POLL_ADD: 2897 + ret = io_poll_add_prep(req); 2898 + break; 2899 + case IORING_OP_POLL_REMOVE: 2900 + ret = io_poll_remove_prep(req); 2901 + break; 2902 + case IORING_OP_FSYNC: 2903 + ret = io_prep_fsync(req); 2904 + break; 2905 + case IORING_OP_SYNC_FILE_RANGE: 2906 + ret = io_prep_sfr(req); 3087 2907 break; 3088 2908 case IORING_OP_SENDMSG: 3089 2909 ret = io_sendmsg_prep(req, io); ··· 3136 2874 
ret = io_connect_prep(req, io); 3137 2875 break; 3138 2876 case IORING_OP_TIMEOUT: 3139 - return io_timeout_prep(req, io, false); 2877 + ret = io_timeout_prep(req, io, false); 2878 + break; 2879 + case IORING_OP_TIMEOUT_REMOVE: 2880 + ret = io_timeout_remove_prep(req); 2881 + break; 2882 + case IORING_OP_ASYNC_CANCEL: 2883 + ret = io_async_cancel_prep(req); 2884 + break; 3140 2885 case IORING_OP_LINK_TIMEOUT: 3141 - return io_timeout_prep(req, io, true); 2886 + ret = io_timeout_prep(req, io, true); 2887 + break; 2888 + case IORING_OP_ACCEPT: 2889 + ret = io_accept_prep(req); 2890 + break; 3142 2891 default: 3143 - req->io = io; 3144 - return 0; 2892 + printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", 2893 + req->opcode); 2894 + ret = -EINVAL; 2895 + break; 3145 2896 } 3146 2897 3147 - if (ret < 0) 3148 - return ret; 3149 - 3150 - req->io = io; 3151 - io_req_map_io(req, ret, iovec, inline_vecs, &iter); 3152 - return 0; 2898 + return ret; 3153 2899 } 3154 2900 3155 2901 static int io_req_defer(struct io_kiocb *req) 3156 2902 { 3157 2903 struct io_ring_ctx *ctx = req->ctx; 3158 - struct io_async_ctx *io; 3159 2904 int ret; 3160 2905 3161 2906 /* Still need defer if there is pending req in defer list. 
*/ 3162 2907 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) 3163 2908 return 0; 3164 2909 3165 - io = kmalloc(sizeof(*io), GFP_KERNEL); 3166 - if (!io) 2910 + if (io_alloc_async_ctx(req)) 3167 2911 return -EAGAIN; 3168 2912 3169 - ret = io_req_defer_prep(req, io); 3170 - if (ret < 0) { 3171 - kfree(io); 2913 + ret = io_req_defer_prep(req); 2914 + if (ret < 0) 3172 2915 return ret; 3173 - } 3174 2916 3175 2917 spin_lock_irq(&ctx->completion_lock); 3176 2918 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { ··· 3192 2926 static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, 3193 2927 bool force_nonblock) 3194 2928 { 3195 - int ret, opcode; 3196 2929 struct io_ring_ctx *ctx = req->ctx; 2930 + int ret; 3197 2931 3198 - opcode = READ_ONCE(req->sqe->opcode); 3199 - switch (opcode) { 2932 + switch (req->opcode) { 3200 2933 case IORING_OP_NOP: 3201 2934 ret = io_nop(req); 3202 2935 break; ··· 3216 2951 ret = io_write(req, nxt, force_nonblock); 3217 2952 break; 3218 2953 case IORING_OP_FSYNC: 3219 - ret = io_fsync(req, req->sqe, nxt, force_nonblock); 2954 + ret = io_fsync(req, nxt, force_nonblock); 3220 2955 break; 3221 2956 case IORING_OP_POLL_ADD: 3222 - ret = io_poll_add(req, req->sqe, nxt); 2957 + ret = io_poll_add(req, nxt); 3223 2958 break; 3224 2959 case IORING_OP_POLL_REMOVE: 3225 - ret = io_poll_remove(req, req->sqe); 2960 + ret = io_poll_remove(req); 3226 2961 break; 3227 2962 case IORING_OP_SYNC_FILE_RANGE: 3228 - ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock); 2963 + ret = io_sync_file_range(req, nxt, force_nonblock); 3229 2964 break; 3230 2965 case IORING_OP_SENDMSG: 3231 - ret = io_sendmsg(req, req->sqe, nxt, force_nonblock); 2966 + ret = io_sendmsg(req, nxt, force_nonblock); 3232 2967 break; 3233 2968 case IORING_OP_RECVMSG: 3234 - ret = io_recvmsg(req, req->sqe, nxt, force_nonblock); 2969 + ret = io_recvmsg(req, nxt, force_nonblock); 3235 2970 break; 3236 2971 case IORING_OP_TIMEOUT: 3237 - ret = 
io_timeout(req, req->sqe); 2972 + ret = io_timeout(req); 3238 2973 break; 3239 2974 case IORING_OP_TIMEOUT_REMOVE: 3240 - ret = io_timeout_remove(req, req->sqe); 2975 + ret = io_timeout_remove(req); 3241 2976 break; 3242 2977 case IORING_OP_ACCEPT: 3243 - ret = io_accept(req, req->sqe, nxt, force_nonblock); 2978 + ret = io_accept(req, nxt, force_nonblock); 3244 2979 break; 3245 2980 case IORING_OP_CONNECT: 3246 - ret = io_connect(req, req->sqe, nxt, force_nonblock); 2981 + ret = io_connect(req, nxt, force_nonblock); 3247 2982 break; 3248 2983 case IORING_OP_ASYNC_CANCEL: 3249 - ret = io_async_cancel(req, req->sqe, nxt); 2984 + ret = io_async_cancel(req, nxt); 3250 2985 break; 3251 2986 default: 3252 2987 ret = -EINVAL; ··· 3281 3016 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 3282 3017 struct io_kiocb *nxt = NULL; 3283 3018 int ret = 0; 3284 - 3285 - /* Ensure we clear previously set non-block flag */ 3286 - req->rw.ki_flags &= ~IOCB_NOWAIT; 3287 3019 3288 3020 if (work->flags & IO_WQ_WORK_CANCEL) 3289 3021 ret = -ECANCELED; ··· 3329 3067 return op >= IORING_OP_NOP && op < IORING_OP_LAST; 3330 3068 } 3331 3069 3332 - static int io_op_needs_file(const struct io_uring_sqe *sqe) 3070 + static int io_req_needs_file(struct io_kiocb *req) 3333 3071 { 3334 - int op = READ_ONCE(sqe->opcode); 3335 - 3336 - switch (op) { 3072 + switch (req->opcode) { 3337 3073 case IORING_OP_NOP: 3338 3074 case IORING_OP_POLL_REMOVE: 3339 3075 case IORING_OP_TIMEOUT: ··· 3340 3080 case IORING_OP_LINK_TIMEOUT: 3341 3081 return 0; 3342 3082 default: 3343 - if (io_req_op_valid(op)) 3083 + if (io_req_op_valid(req->opcode)) 3344 3084 return 1; 3345 3085 return -EINVAL; 3346 3086 } ··· 3367 3107 if (flags & IOSQE_IO_DRAIN) 3368 3108 req->flags |= REQ_F_IO_DRAIN; 3369 3109 3370 - ret = io_op_needs_file(req->sqe); 3110 + ret = io_req_needs_file(req); 3371 3111 if (ret <= 0) 3372 3112 return ret; 3373 3113 ··· 3487 3227 3488 3228 nxt = list_first_entry_or_null(&req->link_list, 
struct io_kiocb, 3489 3229 link_list); 3490 - if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) 3230 + if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) 3491 3231 return NULL; 3492 3232 3493 3233 req->flags |= REQ_F_LINK_TIMEOUT; ··· 3589 3329 struct io_ring_ctx *ctx = req->ctx; 3590 3330 int ret; 3591 3331 3592 - req->user_data = req->sqe->user_data; 3593 - 3594 3332 /* enforce forwards compatibility on users */ 3595 3333 if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { 3596 3334 ret = -EINVAL; ··· 3612 3354 */ 3613 3355 if (*link) { 3614 3356 struct io_kiocb *prev = *link; 3615 - struct io_async_ctx *io; 3616 3357 3617 3358 if (req->sqe->flags & IOSQE_IO_DRAIN) 3618 3359 (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; ··· 3619 3362 if (req->sqe->flags & IOSQE_IO_HARDLINK) 3620 3363 req->flags |= REQ_F_HARDLINK; 3621 3364 3622 - io = kmalloc(sizeof(*io), GFP_KERNEL); 3623 - if (!io) { 3365 + if (io_alloc_async_ctx(req)) { 3624 3366 ret = -EAGAIN; 3625 3367 goto err_req; 3626 3368 } 3627 3369 3628 - ret = io_req_defer_prep(req, io); 3370 + ret = io_req_defer_prep(req); 3629 3371 if (ret) { 3630 - kfree(io); 3631 3372 /* fail even hard links since we don't submit */ 3632 3373 prev->flags |= REQ_F_FAIL_LINK; 3633 3374 goto err_req; ··· 3685 3430 } 3686 3431 3687 3432 /* 3688 - * Fetch an sqe, if one is available. Note that s->sqe will point to memory 3433 + * Fetch an sqe, if one is available. Note that req->sqe will point to memory 3689 3434 * that is mapped by userspace. This means that care needs to be taken to 3690 3435 * ensure that reads are stable, as we cannot rely on userspace always 3691 3436 * being a good citizen. 
If members of the sqe are validated and then later ··· 3720 3465 */ 3721 3466 req->sequence = ctx->cached_sq_head; 3722 3467 req->sqe = &ctx->sq_sqes[head]; 3468 + req->opcode = READ_ONCE(req->sqe->opcode); 3469 + req->user_data = READ_ONCE(req->sqe->user_data); 3723 3470 ctx->cached_sq_head++; 3724 3471 return true; 3725 3472 } ··· 3767 3510 break; 3768 3511 } 3769 3512 3770 - if (io_sqe_needs_user(req->sqe) && !*mm) { 3513 + if (io_req_needs_user(req) && !*mm) { 3771 3514 mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); 3772 3515 if (!mm_fault) { 3773 3516 use_mm(ctx->sqo_mm); ··· 3783 3526 req->has_user = *mm != NULL; 3784 3527 req->in_async = async; 3785 3528 req->needs_fixed_file = async; 3786 - trace_io_uring_submit_sqe(ctx, req->sqe->user_data, 3787 - true, async); 3529 + trace_io_uring_submit_sqe(ctx, req->user_data, true, async); 3788 3530 if (!io_submit_sqe(req, statep, &link)) 3789 3531 break; 3790 3532 /* 3791 3533 * If previous wasn't linked and we have a linked command, 3792 3534 * that's the end of the chain. Submit the previous link. 3793 3535 */ 3794 - if (!(sqe_flags & IOSQE_IO_LINK) && link) { 3536 + if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) { 3795 3537 io_queue_link_head(link); 3796 3538 link = NULL; 3797 3539 } ··· 3950 3694 struct io_ring_ctx *ctx = iowq->ctx; 3951 3695 3952 3696 /* 3953 - * Wake up if we have enough events, or if a timeout occured since we 3697 + * Wake up if we have enough events, or if a timeout occurred since we 3954 3698 * started waiting. For timeouts, we always want to return to userspace, 3955 3699 * regardless of event count. 
3956 3700 */ ··· 5140 4884 submitted = io_submit_sqes(ctx, to_submit, f.file, fd, 5141 4885 &cur_mm, false); 5142 4886 mutex_unlock(&ctx->uring_lock); 4887 + 4888 + if (submitted != to_submit) 4889 + goto out; 5143 4890 } 5144 4891 if (flags & IORING_ENTER_GETEVENTS) { 5145 4892 unsigned nr_events = 0; ··· 5156 4897 } 5157 4898 } 5158 4899 4900 + out: 5159 4901 percpu_ref_put(&ctx->refs); 5160 4902 out_fput: 5161 4903 fdput(f);