Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'io_uring-5.5-20191226' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:

- Removal of the now-unused busy wqe list (Hillf)

- Add cond_resched() to io-wq work processing (Hillf)

- And then the series that I hinted at from last week, which removes
the sqe from the io_kiocb and keeps all sqe handling on the prep
side. This guarantees that an opcode can't do the wrong thing and
read the sqe more than once. This is unchanged from last week, no
issues have been observed with this in testing. Hence I really think
we should fold this into 5.5.

* tag 'io_uring-5.5-20191226' of git://git.kernel.dk/linux-block:
io-wq: add cond_resched() to worker thread
io-wq: remove unused busy list from io_sqe
io_uring: pass in 'sqe' to the prep handlers
io_uring: standardize the prep methods
io_uring: read 'count' for IORING_OP_TIMEOUT in prep handler
io_uring: move all prep state for IORING_OP_{SEND,RECV}_MGS to prep handler
io_uring: move all prep state for IORING_OP_CONNECT to prep handler
io_uring: add and use struct io_rw for read/writes
io_uring: use u64_to_user_ptr() consistently

+357 -343
+2 -8
fs/io-wq.c
··· 92 92 struct io_wqe_acct acct[2]; 93 93 94 94 struct hlist_nulls_head free_list; 95 - struct hlist_nulls_head busy_list; 96 95 struct list_head all_list; 97 96 98 97 struct io_wq *wq; ··· 326 327 if (worker->flags & IO_WORKER_F_FREE) { 327 328 worker->flags &= ~IO_WORKER_F_FREE; 328 329 hlist_nulls_del_init_rcu(&worker->nulls_node); 329 - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list); 330 330 } 331 331 332 332 /* ··· 363 365 { 364 366 if (!(worker->flags & IO_WORKER_F_FREE)) { 365 367 worker->flags |= IO_WORKER_F_FREE; 366 - hlist_nulls_del_init_rcu(&worker->nulls_node); 367 368 hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); 368 369 } 369 370 ··· 428 431 /* flush any pending signals before assigning new work */ 429 432 if (signal_pending(current)) 430 433 flush_signals(current); 434 + 435 + cond_resched(); 431 436 432 437 spin_lock_irq(&worker->lock); 433 438 worker->cur_work = work; ··· 797 798 798 799 set_bit(IO_WQ_BIT_CANCEL, &wq->state); 799 800 800 - /* 801 - * Browse both lists, as there's a gap between handing work off 802 - * to a worker and the worker putting itself on the busy_list 803 - */ 804 801 rcu_read_lock(); 805 802 for_each_node(node) { 806 803 struct io_wqe *wqe = wq->wqes[node]; ··· 1044 1049 spin_lock_init(&wqe->lock); 1045 1050 INIT_WQ_LIST(&wqe->work_list); 1046 1051 INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); 1047 - INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1); 1048 1052 INIT_LIST_HEAD(&wqe->all_list); 1049 1053 } 1050 1054
+355 -335
fs/io_uring.c
··· 330 330 struct file *file; 331 331 u64 addr; 332 332 int flags; 333 + unsigned count; 334 + }; 335 + 336 + struct io_rw { 337 + /* NOTE: kiocb has the file as the first member, so don't do it here */ 338 + struct kiocb kiocb; 339 + u64 addr; 340 + u64 len; 341 + }; 342 + 343 + struct io_connect { 344 + struct file *file; 345 + struct sockaddr __user *addr; 346 + int addr_len; 347 + }; 348 + 349 + struct io_sr_msg { 350 + struct file *file; 351 + struct user_msghdr __user *msg; 352 + int msg_flags; 333 353 }; 334 354 335 355 struct io_async_connect { ··· 371 351 }; 372 352 373 353 struct io_async_ctx { 374 - struct io_uring_sqe sqe; 375 354 union { 376 355 struct io_async_rw rw; 377 356 struct io_async_msghdr msg; ··· 388 369 struct io_kiocb { 389 370 union { 390 371 struct file *file; 391 - struct kiocb rw; 372 + struct io_rw rw; 392 373 struct io_poll_iocb poll; 393 374 struct io_accept accept; 394 375 struct io_sync sync; 395 376 struct io_cancel cancel; 396 377 struct io_timeout timeout; 378 + struct io_connect connect; 379 + struct io_sr_msg sr_msg; 397 380 }; 398 381 399 - const struct io_uring_sqe *sqe; 400 382 struct io_async_ctx *io; 401 383 struct file *ring_file; 402 384 int ring_fd; ··· 431 411 #define REQ_F_INFLIGHT 16384 /* on inflight list */ 432 412 #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ 433 413 #define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ 434 - #define REQ_F_PREPPED 131072 /* request already opcode prepared */ 435 414 u64 user_data; 436 415 u32 result; 437 416 u32 sequence; ··· 628 609 { 629 610 bool do_hashed = false; 630 611 631 - if (req->sqe) { 632 - switch (req->opcode) { 633 - case IORING_OP_WRITEV: 634 - case IORING_OP_WRITE_FIXED: 635 - /* only regular files should be hashed for writes */ 636 - if (req->flags & REQ_F_ISREG) 637 - do_hashed = true; 638 - /* fall-through */ 639 - case IORING_OP_READV: 640 - case IORING_OP_READ_FIXED: 641 - case IORING_OP_SENDMSG: 642 - case IORING_OP_RECVMSG: 643 
- case IORING_OP_ACCEPT: 644 - case IORING_OP_POLL_ADD: 645 - case IORING_OP_CONNECT: 646 - /* 647 - * We know REQ_F_ISREG is not set on some of these 648 - * opcodes, but this enables us to keep the check in 649 - * just one place. 650 - */ 651 - if (!(req->flags & REQ_F_ISREG)) 652 - req->work.flags |= IO_WQ_WORK_UNBOUND; 653 - break; 654 - } 655 - if (io_req_needs_user(req)) 656 - req->work.flags |= IO_WQ_WORK_NEEDS_USER; 612 + switch (req->opcode) { 613 + case IORING_OP_WRITEV: 614 + case IORING_OP_WRITE_FIXED: 615 + /* only regular files should be hashed for writes */ 616 + if (req->flags & REQ_F_ISREG) 617 + do_hashed = true; 618 + /* fall-through */ 619 + case IORING_OP_READV: 620 + case IORING_OP_READ_FIXED: 621 + case IORING_OP_SENDMSG: 622 + case IORING_OP_RECVMSG: 623 + case IORING_OP_ACCEPT: 624 + case IORING_OP_POLL_ADD: 625 + case IORING_OP_CONNECT: 626 + /* 627 + * We know REQ_F_ISREG is not set on some of these 628 + * opcodes, but this enables us to keep the check in 629 + * just one place. 630 + */ 631 + if (!(req->flags & REQ_F_ISREG)) 632 + req->work.flags |= IO_WQ_WORK_UNBOUND; 633 + break; 657 634 } 635 + if (io_req_needs_user(req)) 636 + req->work.flags |= IO_WQ_WORK_NEEDS_USER; 658 637 659 638 *link = io_prep_linked_timeout(req); 660 639 return do_hashed; ··· 1197 1180 1198 1181 ret = 0; 1199 1182 list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { 1200 - struct kiocb *kiocb = &req->rw; 1183 + struct kiocb *kiocb = &req->rw.kiocb; 1201 1184 1202 1185 /* 1203 1186 * Move completed entries to our local list. 
If we find a ··· 1352 1335 1353 1336 static void io_complete_rw_common(struct kiocb *kiocb, long res) 1354 1337 { 1355 - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); 1338 + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 1356 1339 1357 1340 if (kiocb->ki_flags & IOCB_WRITE) 1358 1341 kiocb_end_write(req); ··· 1364 1347 1365 1348 static void io_complete_rw(struct kiocb *kiocb, long res, long res2) 1366 1349 { 1367 - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); 1350 + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 1368 1351 1369 1352 io_complete_rw_common(kiocb, res); 1370 1353 io_put_req(req); ··· 1372 1355 1373 1356 static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) 1374 1357 { 1375 - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); 1358 + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 1376 1359 struct io_kiocb *nxt = NULL; 1377 1360 1378 1361 io_complete_rw_common(kiocb, res); ··· 1383 1366 1384 1367 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) 1385 1368 { 1386 - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); 1369 + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 1387 1370 1388 1371 if (kiocb->ki_flags & IOCB_WRITE) 1389 1372 kiocb_end_write(req); ··· 1417 1400 1418 1401 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, 1419 1402 list); 1420 - if (list_req->rw.ki_filp != req->rw.ki_filp) 1403 + if (list_req->file != req->file) 1421 1404 ctx->poll_multi_file = true; 1422 1405 } 1423 1406 ··· 1488 1471 return false; 1489 1472 } 1490 1473 1491 - static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) 1474 + static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, 1475 + bool force_nonblock) 1492 1476 { 1493 - const struct io_uring_sqe *sqe = req->sqe; 1494 1477 struct io_ring_ctx *ctx = req->ctx; 1495 - struct 
kiocb *kiocb = &req->rw; 1478 + struct kiocb *kiocb = &req->rw.kiocb; 1496 1479 unsigned ioprio; 1497 1480 int ret; 1498 1481 ··· 1541 1524 return -EINVAL; 1542 1525 kiocb->ki_complete = io_complete_rw; 1543 1526 } 1527 + 1528 + req->rw.addr = READ_ONCE(sqe->addr); 1529 + req->rw.len = READ_ONCE(sqe->len); 1530 + /* we own ->private, reuse it for the buffer index */ 1531 + req->rw.kiocb.private = (void *) (unsigned long) 1532 + READ_ONCE(sqe->buf_index); 1544 1533 return 0; 1545 1534 } 1546 1535 ··· 1580 1557 io_rw_done(kiocb, ret); 1581 1558 } 1582 1559 1583 - static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, 1584 - const struct io_uring_sqe *sqe, 1560 + static ssize_t io_import_fixed(struct io_kiocb *req, int rw, 1585 1561 struct iov_iter *iter) 1586 1562 { 1587 - size_t len = READ_ONCE(sqe->len); 1563 + struct io_ring_ctx *ctx = req->ctx; 1564 + size_t len = req->rw.len; 1588 1565 struct io_mapped_ubuf *imu; 1589 1566 unsigned index, buf_index; 1590 1567 size_t offset; ··· 1594 1571 if (unlikely(!ctx->user_bufs)) 1595 1572 return -EFAULT; 1596 1573 1597 - buf_index = READ_ONCE(sqe->buf_index); 1574 + buf_index = (unsigned long) req->rw.kiocb.private; 1598 1575 if (unlikely(buf_index >= ctx->nr_user_bufs)) 1599 1576 return -EFAULT; 1600 1577 1601 1578 index = array_index_nospec(buf_index, ctx->nr_user_bufs); 1602 1579 imu = &ctx->user_bufs[index]; 1603 - buf_addr = READ_ONCE(sqe->addr); 1580 + buf_addr = req->rw.addr; 1604 1581 1605 1582 /* overflow */ 1606 1583 if (buf_addr + len < buf_addr) ··· 1657 1634 static ssize_t io_import_iovec(int rw, struct io_kiocb *req, 1658 1635 struct iovec **iovec, struct iov_iter *iter) 1659 1636 { 1660 - const struct io_uring_sqe *sqe = req->sqe; 1661 - void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 1662 - size_t sqe_len = READ_ONCE(sqe->len); 1637 + void __user *buf = u64_to_user_ptr(req->rw.addr); 1638 + size_t sqe_len = req->rw.len; 1663 1639 u8 opcode; 1664 1640 1665 - /* 1666 - * We're reading 
->opcode for the second time, but the first read 1667 - * doesn't care whether it's _FIXED or not, so it doesn't matter 1668 - * whether ->opcode changes concurrently. The first read does care 1669 - * about whether it is a READ or a WRITE, so we don't trust this read 1670 - * for that purpose and instead let the caller pass in the read/write 1671 - * flag. 1672 - */ 1673 1641 opcode = req->opcode; 1674 1642 if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { 1675 1643 *iovec = NULL; 1676 - return io_import_fixed(req->ctx, rw, sqe, iter); 1644 + return io_import_fixed(req, rw, iter); 1677 1645 } 1646 + 1647 + /* buffer index only valid with fixed read/write */ 1648 + if (req->rw.kiocb.private) 1649 + return -EINVAL; 1678 1650 1679 1651 if (req->io) { 1680 1652 struct io_async_rw *iorw = &req->io->rw; ··· 1768 1750 static int io_alloc_async_ctx(struct io_kiocb *req) 1769 1751 { 1770 1752 req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); 1771 - if (req->io) { 1772 - memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); 1773 - req->sqe = &req->io->sqe; 1774 - return 0; 1775 - } 1776 - 1777 - return 1; 1753 + return req->io == NULL; 1778 1754 } 1779 1755 1780 1756 static void io_rw_async(struct io_wq_work **workptr) ··· 1794 1782 return 0; 1795 1783 } 1796 1784 1797 - static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, 1798 - struct iov_iter *iter, bool force_nonblock) 1785 + static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, 1786 + bool force_nonblock) 1799 1787 { 1788 + struct io_async_ctx *io; 1789 + struct iov_iter iter; 1800 1790 ssize_t ret; 1801 1791 1802 - ret = io_prep_rw(req, force_nonblock); 1792 + ret = io_prep_rw(req, sqe, force_nonblock); 1803 1793 if (ret) 1804 1794 return ret; 1805 1795 1806 1796 if (unlikely(!(req->file->f_mode & FMODE_READ))) 1807 1797 return -EBADF; 1808 1798 1809 - return io_import_iovec(READ, req, iovec, iter); 1799 + if (!req->io) 1800 + return 0; 1801 + 1802 + io = 
req->io; 1803 + io->rw.iov = io->rw.fast_iov; 1804 + req->io = NULL; 1805 + ret = io_import_iovec(READ, req, &io->rw.iov, &iter); 1806 + req->io = io; 1807 + if (ret < 0) 1808 + return ret; 1809 + 1810 + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); 1811 + return 0; 1810 1812 } 1811 1813 1812 1814 static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, 1813 1815 bool force_nonblock) 1814 1816 { 1815 1817 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 1816 - struct kiocb *kiocb = &req->rw; 1818 + struct kiocb *kiocb = &req->rw.kiocb; 1817 1819 struct iov_iter iter; 1818 - struct file *file; 1819 1820 size_t iov_count; 1820 1821 ssize_t io_size, ret; 1821 1822 1822 - if (!req->io) { 1823 - ret = io_read_prep(req, &iovec, &iter, force_nonblock); 1824 - if (ret < 0) 1825 - return ret; 1826 - } else { 1827 - ret = io_import_iovec(READ, req, &iovec, &iter); 1828 - if (ret < 0) 1829 - return ret; 1830 - } 1823 + ret = io_import_iovec(READ, req, &iovec, &iter); 1824 + if (ret < 0) 1825 + return ret; 1831 1826 1832 1827 /* Ensure we clear previously set non-block flag */ 1833 1828 if (!force_nonblock) 1834 - req->rw.ki_flags &= ~IOCB_NOWAIT; 1829 + req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; 1835 1830 1836 - file = req->file; 1837 1831 io_size = ret; 1838 1832 if (req->flags & REQ_F_LINK) 1839 1833 req->result = io_size; ··· 1848 1830 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so 1849 1831 * we know to async punt it even if it was opened O_NONBLOCK 1850 1832 */ 1851 - if (force_nonblock && !io_file_supports_async(file)) { 1833 + if (force_nonblock && !io_file_supports_async(req->file)) { 1852 1834 req->flags |= REQ_F_MUST_PUNT; 1853 1835 goto copy_iov; 1854 1836 } 1855 1837 1856 1838 iov_count = iov_iter_count(&iter); 1857 - ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); 1839 + ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); 1858 1840 if (!ret) { 1859 1841 ssize_t ret2; 1860 1842 1861 - if 
(file->f_op->read_iter) 1862 - ret2 = call_read_iter(file, kiocb, &iter); 1843 + if (req->file->f_op->read_iter) 1844 + ret2 = call_read_iter(req->file, kiocb, &iter); 1863 1845 else 1864 - ret2 = loop_rw_iter(READ, file, kiocb, &iter); 1846 + ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); 1865 1847 1866 1848 /* 1867 1849 * In case of a short read, punt to async. This can happen ··· 1893 1875 return ret; 1894 1876 } 1895 1877 1896 - static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, 1897 - struct iov_iter *iter, bool force_nonblock) 1878 + static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, 1879 + bool force_nonblock) 1898 1880 { 1881 + struct io_async_ctx *io; 1882 + struct iov_iter iter; 1899 1883 ssize_t ret; 1900 1884 1901 - ret = io_prep_rw(req, force_nonblock); 1885 + ret = io_prep_rw(req, sqe, force_nonblock); 1902 1886 if (ret) 1903 1887 return ret; 1904 1888 1905 1889 if (unlikely(!(req->file->f_mode & FMODE_WRITE))) 1906 1890 return -EBADF; 1907 1891 1908 - return io_import_iovec(WRITE, req, iovec, iter); 1892 + if (!req->io) 1893 + return 0; 1894 + 1895 + io = req->io; 1896 + io->rw.iov = io->rw.fast_iov; 1897 + req->io = NULL; 1898 + ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter); 1899 + req->io = io; 1900 + if (ret < 0) 1901 + return ret; 1902 + 1903 + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); 1904 + return 0; 1909 1905 } 1910 1906 1911 1907 static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, 1912 1908 bool force_nonblock) 1913 1909 { 1914 1910 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 1915 - struct kiocb *kiocb = &req->rw; 1911 + struct kiocb *kiocb = &req->rw.kiocb; 1916 1912 struct iov_iter iter; 1917 - struct file *file; 1918 1913 size_t iov_count; 1919 1914 ssize_t ret, io_size; 1920 1915 1921 - if (!req->io) { 1922 - ret = io_write_prep(req, &iovec, &iter, force_nonblock); 1923 - if (ret < 0) 1924 - return ret; 1925 - } else { 1926 - ret = 
io_import_iovec(WRITE, req, &iovec, &iter); 1927 - if (ret < 0) 1928 - return ret; 1929 - } 1916 + ret = io_import_iovec(WRITE, req, &iovec, &iter); 1917 + if (ret < 0) 1918 + return ret; 1930 1919 1931 1920 /* Ensure we clear previously set non-block flag */ 1932 1921 if (!force_nonblock) 1933 - req->rw.ki_flags &= ~IOCB_NOWAIT; 1922 + req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; 1934 1923 1935 - file = kiocb->ki_filp; 1936 1924 io_size = ret; 1937 1925 if (req->flags & REQ_F_LINK) 1938 1926 req->result = io_size; ··· 1958 1934 goto copy_iov; 1959 1935 1960 1936 iov_count = iov_iter_count(&iter); 1961 - ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); 1937 + ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); 1962 1938 if (!ret) { 1963 1939 ssize_t ret2; 1964 1940 ··· 1970 1946 * we return to userspace. 1971 1947 */ 1972 1948 if (req->flags & REQ_F_ISREG) { 1973 - __sb_start_write(file_inode(file)->i_sb, 1949 + __sb_start_write(file_inode(req->file)->i_sb, 1974 1950 SB_FREEZE_WRITE, true); 1975 - __sb_writers_release(file_inode(file)->i_sb, 1951 + __sb_writers_release(file_inode(req->file)->i_sb, 1976 1952 SB_FREEZE_WRITE); 1977 1953 } 1978 1954 kiocb->ki_flags |= IOCB_WRITE; 1979 1955 1980 - if (file->f_op->write_iter) 1981 - ret2 = call_write_iter(file, kiocb, &iter); 1956 + if (req->file->f_op->write_iter) 1957 + ret2 = call_write_iter(req->file, kiocb, &iter); 1982 1958 else 1983 - ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); 1959 + ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); 1984 1960 if (!force_nonblock || ret2 != -EAGAIN) { 1985 1961 kiocb_done(kiocb, ret2, nxt, req->in_async); 1986 1962 } else { ··· 2013 1989 return 0; 2014 1990 } 2015 1991 2016 - static int io_prep_fsync(struct io_kiocb *req) 1992 + static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2017 1993 { 2018 - const struct io_uring_sqe *sqe = req->sqe; 2019 1994 struct io_ring_ctx *ctx = req->ctx; 2020 1995 2021 - if (req->flags & 
REQ_F_PREPPED) 2022 - return 0; 2023 1996 if (!req->file) 2024 1997 return -EBADF; 2025 1998 ··· 2031 2010 2032 2011 req->sync.off = READ_ONCE(sqe->off); 2033 2012 req->sync.len = READ_ONCE(sqe->len); 2034 - req->flags |= REQ_F_PREPPED; 2035 2013 return 0; 2036 2014 } 2037 2015 ··· 2056 2036 if (io_req_cancelled(req)) 2057 2037 return; 2058 2038 2059 - ret = vfs_fsync_range(req->rw.ki_filp, req->sync.off, 2039 + ret = vfs_fsync_range(req->file, req->sync.off, 2060 2040 end > 0 ? end : LLONG_MAX, 2061 2041 req->sync.flags & IORING_FSYNC_DATASYNC); 2062 2042 if (ret < 0) ··· 2071 2051 bool force_nonblock) 2072 2052 { 2073 2053 struct io_wq_work *work, *old_work; 2074 - int ret; 2075 - 2076 - ret = io_prep_fsync(req); 2077 - if (ret) 2078 - return ret; 2079 2054 2080 2055 /* fsync always requires a blocking context */ 2081 2056 if (force_nonblock) { ··· 2086 2071 return 0; 2087 2072 } 2088 2073 2089 - static int io_prep_sfr(struct io_kiocb *req) 2074 + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2090 2075 { 2091 - const struct io_uring_sqe *sqe = req->sqe; 2092 2076 struct io_ring_ctx *ctx = req->ctx; 2093 2077 2094 - if (req->flags & REQ_F_PREPPED) 2095 - return 0; 2096 2078 if (!req->file) 2097 2079 return -EBADF; 2098 2080 ··· 2101 2089 req->sync.off = READ_ONCE(sqe->off); 2102 2090 req->sync.len = READ_ONCE(sqe->len); 2103 2091 req->sync.flags = READ_ONCE(sqe->sync_range_flags); 2104 - req->flags |= REQ_F_PREPPED; 2105 2092 return 0; 2106 2093 } 2107 2094 ··· 2113 2102 if (io_req_cancelled(req)) 2114 2103 return; 2115 2104 2116 - ret = sync_file_range(req->rw.ki_filp, req->sync.off, req->sync.len, 2105 + ret = sync_file_range(req->file, req->sync.off, req->sync.len, 2117 2106 req->sync.flags); 2118 2107 if (ret < 0) 2119 2108 req_set_fail_links(req); ··· 2127 2116 bool force_nonblock) 2128 2117 { 2129 2118 struct io_wq_work *work, *old_work; 2130 - int ret; 2131 - 2132 - ret = io_prep_sfr(req); 2133 - if (ret) 2134 - return ret; 
2135 2119 2136 2120 /* sync_file_range always requires a blocking context */ 2137 2121 if (force_nonblock) { ··· 2155 2149 } 2156 2150 #endif 2157 2151 2158 - static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) 2152 + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2159 2153 { 2160 2154 #if defined(CONFIG_NET) 2161 - const struct io_uring_sqe *sqe = req->sqe; 2162 - struct user_msghdr __user *msg; 2163 - unsigned flags; 2155 + struct io_sr_msg *sr = &req->sr_msg; 2156 + struct io_async_ctx *io = req->io; 2164 2157 2165 - flags = READ_ONCE(sqe->msg_flags); 2166 - msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); 2158 + sr->msg_flags = READ_ONCE(sqe->msg_flags); 2159 + sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 2160 + 2161 + if (!io) 2162 + return 0; 2163 + 2167 2164 io->msg.iov = io->msg.fast_iov; 2168 - return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); 2165 + return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, 2166 + &io->msg.iov); 2169 2167 #else 2170 - return 0; 2168 + return -EOPNOTSUPP; 2171 2169 #endif 2172 2170 } 2173 2171 ··· 2179 2169 bool force_nonblock) 2180 2170 { 2181 2171 #if defined(CONFIG_NET) 2182 - const struct io_uring_sqe *sqe = req->sqe; 2183 2172 struct io_async_msghdr *kmsg = NULL; 2184 2173 struct socket *sock; 2185 2174 int ret; ··· 2192 2183 struct sockaddr_storage addr; 2193 2184 unsigned flags; 2194 2185 2195 - flags = READ_ONCE(sqe->msg_flags); 2196 - if (flags & MSG_DONTWAIT) 2197 - req->flags |= REQ_F_NOWAIT; 2198 - else if (force_nonblock) 2199 - flags |= MSG_DONTWAIT; 2200 - 2201 2186 if (req->io) { 2202 2187 kmsg = &req->io->msg; 2203 2188 kmsg->msg.msg_name = &addr; ··· 2200 2197 kmsg->iov = kmsg->fast_iov; 2201 2198 kmsg->msg.msg_iter.iov = kmsg->iov; 2202 2199 } else { 2200 + struct io_sr_msg *sr = &req->sr_msg; 2201 + 2203 2202 kmsg = &io.msg; 2204 2203 kmsg->msg.msg_name = &addr; 2205 - ret = io_sendmsg_prep(req, 
&io); 2204 + 2205 + io.msg.iov = io.msg.fast_iov; 2206 + ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, 2207 + sr->msg_flags, &io.msg.iov); 2206 2208 if (ret) 2207 - goto out; 2209 + return ret; 2208 2210 } 2211 + 2212 + flags = req->sr_msg.msg_flags; 2213 + if (flags & MSG_DONTWAIT) 2214 + req->flags |= REQ_F_NOWAIT; 2215 + else if (force_nonblock) 2216 + flags |= MSG_DONTWAIT; 2209 2217 2210 2218 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 2211 2219 if (force_nonblock && ret == -EAGAIN) { ··· 2232 2218 ret = -EINTR; 2233 2219 } 2234 2220 2235 - out: 2236 2221 if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) 2237 2222 kfree(kmsg->iov); 2238 2223 io_cqring_add_event(req, ret); ··· 2244 2231 #endif 2245 2232 } 2246 2233 2247 - static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) 2234 + static int io_recvmsg_prep(struct io_kiocb *req, 2235 + const struct io_uring_sqe *sqe) 2248 2236 { 2249 2237 #if defined(CONFIG_NET) 2250 - const struct io_uring_sqe *sqe = req->sqe; 2251 - struct user_msghdr __user *msg; 2252 - unsigned flags; 2238 + struct io_sr_msg *sr = &req->sr_msg; 2239 + struct io_async_ctx *io = req->io; 2253 2240 2254 - flags = READ_ONCE(sqe->msg_flags); 2255 - msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); 2241 + sr->msg_flags = READ_ONCE(sqe->msg_flags); 2242 + sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 2243 + 2244 + if (!io) 2245 + return 0; 2246 + 2256 2247 io->msg.iov = io->msg.fast_iov; 2257 - return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, 2258 - &io->msg.iov); 2248 + return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, 2249 + &io->msg.uaddr, &io->msg.iov); 2259 2250 #else 2260 - return 0; 2251 + return -EOPNOTSUPP; 2261 2252 #endif 2262 2253 } 2263 2254 ··· 2269 2252 bool force_nonblock) 2270 2253 { 2271 2254 #if defined(CONFIG_NET) 2272 - const struct io_uring_sqe *sqe = req->sqe; 2273 2255 struct io_async_msghdr *kmsg = NULL; 2274 
2256 struct socket *sock; 2275 2257 int ret; ··· 2278 2262 2279 2263 sock = sock_from_file(req->file, &ret); 2280 2264 if (sock) { 2281 - struct user_msghdr __user *msg; 2282 2265 struct io_async_ctx io; 2283 2266 struct sockaddr_storage addr; 2284 2267 unsigned flags; 2285 2268 2286 - flags = READ_ONCE(sqe->msg_flags); 2287 - if (flags & MSG_DONTWAIT) 2288 - req->flags |= REQ_F_NOWAIT; 2289 - else if (force_nonblock) 2290 - flags |= MSG_DONTWAIT; 2291 - 2292 - msg = (struct user_msghdr __user *) (unsigned long) 2293 - READ_ONCE(sqe->addr); 2294 2269 if (req->io) { 2295 2270 kmsg = &req->io->msg; 2296 2271 kmsg->msg.msg_name = &addr; ··· 2290 2283 kmsg->iov = kmsg->fast_iov; 2291 2284 kmsg->msg.msg_iter.iov = kmsg->iov; 2292 2285 } else { 2286 + struct io_sr_msg *sr = &req->sr_msg; 2287 + 2293 2288 kmsg = &io.msg; 2294 2289 kmsg->msg.msg_name = &addr; 2295 - ret = io_recvmsg_prep(req, &io); 2290 + 2291 + io.msg.iov = io.msg.fast_iov; 2292 + ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg, 2293 + sr->msg_flags, &io.msg.uaddr, 2294 + &io.msg.iov); 2296 2295 if (ret) 2297 - goto out; 2296 + return ret; 2298 2297 } 2299 2298 2300 - ret = __sys_recvmsg_sock(sock, &kmsg->msg, msg, kmsg->uaddr, flags); 2299 + flags = req->sr_msg.msg_flags; 2300 + if (flags & MSG_DONTWAIT) 2301 + req->flags |= REQ_F_NOWAIT; 2302 + else if (force_nonblock) 2303 + flags |= MSG_DONTWAIT; 2304 + 2305 + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, 2306 + kmsg->uaddr, flags); 2301 2307 if (force_nonblock && ret == -EAGAIN) { 2302 2308 if (req->io) 2303 2309 return -EAGAIN; ··· 2324 2304 ret = -EINTR; 2325 2305 } 2326 2306 2327 - out: 2328 2307 if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) 2329 2308 kfree(kmsg->iov); 2330 2309 io_cqring_add_event(req, ret); ··· 2336 2317 #endif 2337 2318 } 2338 2319 2339 - static int io_accept_prep(struct io_kiocb *req) 2320 + static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2340 2321 { 2341 
2322 #if defined(CONFIG_NET) 2342 - const struct io_uring_sqe *sqe = req->sqe; 2343 2323 struct io_accept *accept = &req->accept; 2344 - 2345 - if (req->flags & REQ_F_PREPPED) 2346 - return 0; 2347 2324 2348 2325 if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 2349 2326 return -EINVAL; 2350 2327 if (sqe->ioprio || sqe->len || sqe->buf_index) 2351 2328 return -EINVAL; 2352 2329 2353 - accept->addr = (struct sockaddr __user *) 2354 - (unsigned long) READ_ONCE(sqe->addr); 2355 - accept->addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); 2330 + accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 2331 + accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 2356 2332 accept->flags = READ_ONCE(sqe->accept_flags); 2357 - req->flags |= REQ_F_PREPPED; 2358 2333 return 0; 2359 2334 #else 2360 2335 return -EOPNOTSUPP; ··· 2396 2383 #if defined(CONFIG_NET) 2397 2384 int ret; 2398 2385 2399 - ret = io_accept_prep(req); 2400 - if (ret) 2401 - return ret; 2402 - 2403 2386 ret = __io_accept(req, nxt, force_nonblock); 2404 2387 if (ret == -EAGAIN && force_nonblock) { 2405 2388 req->work.func = io_accept_finish; ··· 2409 2400 #endif 2410 2401 } 2411 2402 2412 - static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) 2403 + static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2413 2404 { 2414 2405 #if defined(CONFIG_NET) 2415 - const struct io_uring_sqe *sqe = req->sqe; 2416 - struct sockaddr __user *addr; 2417 - int addr_len; 2406 + struct io_connect *conn = &req->connect; 2407 + struct io_async_ctx *io = req->io; 2418 2408 2419 - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); 2420 - addr_len = READ_ONCE(sqe->addr2); 2421 - return move_addr_to_kernel(addr, addr_len, &io->connect.address); 2409 + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 2410 + return -EINVAL; 2411 + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) 2412 
+ return -EINVAL; 2413 + 2414 + conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 2415 + conn->addr_len = READ_ONCE(sqe->addr2); 2416 + 2417 + if (!io) 2418 + return 0; 2419 + 2420 + return move_addr_to_kernel(conn->addr, conn->addr_len, 2421 + &io->connect.address); 2422 2422 #else 2423 - return 0; 2423 + return -EOPNOTSUPP; 2424 2424 #endif 2425 2425 } 2426 2426 ··· 2437 2419 bool force_nonblock) 2438 2420 { 2439 2421 #if defined(CONFIG_NET) 2440 - const struct io_uring_sqe *sqe = req->sqe; 2441 2422 struct io_async_ctx __io, *io; 2442 2423 unsigned file_flags; 2443 - int addr_len, ret; 2444 - 2445 - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) 2446 - return -EINVAL; 2447 - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) 2448 - return -EINVAL; 2449 - 2450 - addr_len = READ_ONCE(sqe->addr2); 2451 - file_flags = force_nonblock ? O_NONBLOCK : 0; 2424 + int ret; 2452 2425 2453 2426 if (req->io) { 2454 2427 io = req->io; 2455 2428 } else { 2456 - ret = io_connect_prep(req, &__io); 2429 + ret = move_addr_to_kernel(req->connect.addr, 2430 + req->connect.addr_len, 2431 + &__io.connect.address); 2457 2432 if (ret) 2458 2433 goto out; 2459 2434 io = &__io; 2460 2435 } 2461 2436 2462 - ret = __sys_connect_file(req->file, &io->connect.address, addr_len, 2463 - file_flags); 2437 + file_flags = force_nonblock ? 
O_NONBLOCK : 0; 2438 + 2439 + ret = __sys_connect_file(req->file, &io->connect.address, 2440 + req->connect.addr_len, file_flags); 2464 2441 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { 2465 2442 if (req->io) 2466 2443 return -EAGAIN; ··· 2526 2513 return -ENOENT; 2527 2514 } 2528 2515 2529 - static int io_poll_remove_prep(struct io_kiocb *req) 2516 + static int io_poll_remove_prep(struct io_kiocb *req, 2517 + const struct io_uring_sqe *sqe) 2530 2518 { 2531 - const struct io_uring_sqe *sqe = req->sqe; 2532 - 2533 - if (req->flags & REQ_F_PREPPED) 2534 - return 0; 2535 2519 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 2536 2520 return -EINVAL; 2537 2521 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || ··· 2536 2526 return -EINVAL; 2537 2527 2538 2528 req->poll.addr = READ_ONCE(sqe->addr); 2539 - req->flags |= REQ_F_PREPPED; 2540 2529 return 0; 2541 2530 } 2542 2531 ··· 2548 2539 struct io_ring_ctx *ctx = req->ctx; 2549 2540 u64 addr; 2550 2541 int ret; 2551 - 2552 - ret = io_poll_remove_prep(req); 2553 - if (ret) 2554 - return ret; 2555 2542 2556 2543 addr = req->poll.addr; 2557 2544 spin_lock_irq(&ctx->completion_lock); ··· 2686 2681 hlist_add_head(&req->hash_node, list); 2687 2682 } 2688 2683 2689 - static int io_poll_add_prep(struct io_kiocb *req) 2684 + static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 2690 2685 { 2691 - const struct io_uring_sqe *sqe = req->sqe; 2692 2686 struct io_poll_iocb *poll = &req->poll; 2693 2687 u16 events; 2694 2688 2695 - if (req->flags & REQ_F_PREPPED) 2696 - return 0; 2697 2689 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 2698 2690 return -EINVAL; 2699 2691 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) ··· 2698 2696 if (!poll->file) 2699 2697 return -EBADF; 2700 2698 2701 - req->flags |= REQ_F_PREPPED; 2702 2699 events = READ_ONCE(sqe->poll_events); 2703 2700 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; 2704 
2701 return 0; ··· 2710 2709 struct io_poll_table ipt; 2711 2710 bool cancel = false; 2712 2711 __poll_t mask; 2713 - int ret; 2714 - 2715 - ret = io_poll_add_prep(req); 2716 - if (ret) 2717 - return ret; 2718 2712 2719 2713 INIT_IO_WORK(&req->work, io_poll_complete_work); 2720 2714 INIT_HLIST_NODE(&req->hash_node); ··· 2828 2832 return 0; 2829 2833 } 2830 2834 2831 - static int io_timeout_remove_prep(struct io_kiocb *req) 2835 + static int io_timeout_remove_prep(struct io_kiocb *req, 2836 + const struct io_uring_sqe *sqe) 2832 2837 { 2833 - const struct io_uring_sqe *sqe = req->sqe; 2834 - 2835 - if (req->flags & REQ_F_PREPPED) 2836 - return 0; 2837 2838 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 2838 2839 return -EINVAL; 2839 2840 if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) ··· 2841 2848 if (req->timeout.flags) 2842 2849 return -EINVAL; 2843 2850 2844 - req->flags |= REQ_F_PREPPED; 2845 2851 return 0; 2846 2852 } 2847 2853 ··· 2851 2859 { 2852 2860 struct io_ring_ctx *ctx = req->ctx; 2853 2861 int ret; 2854 - 2855 - ret = io_timeout_remove_prep(req); 2856 - if (ret) 2857 - return ret; 2858 2862 2859 2863 spin_lock_irq(&ctx->completion_lock); 2860 2864 ret = io_timeout_cancel(ctx, req->timeout.addr); ··· 2865 2877 return 0; 2866 2878 } 2867 2879 2868 - static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, 2880 + static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2869 2881 bool is_timeout_link) 2870 2882 { 2871 - const struct io_uring_sqe *sqe = req->sqe; 2872 2883 struct io_timeout_data *data; 2873 2884 unsigned flags; 2874 2885 ··· 2881 2894 if (flags & ~IORING_TIMEOUT_ABS) 2882 2895 return -EINVAL; 2883 2896 2884 - data = &io->timeout; 2897 + req->timeout.count = READ_ONCE(sqe->off); 2898 + 2899 + if (!req->io && io_alloc_async_ctx(req)) 2900 + return -ENOMEM; 2901 + 2902 + data = &req->io->timeout; 2885 2903 data->req = req; 2886 2904 req->flags |= REQ_F_TIMEOUT; 2887 2905 ··· 2904 
2912 2905 2913 static int io_timeout(struct io_kiocb *req) 2906 2914 { 2907 - const struct io_uring_sqe *sqe = req->sqe; 2908 2915 unsigned count; 2909 2916 struct io_ring_ctx *ctx = req->ctx; 2910 2917 struct io_timeout_data *data; 2911 2918 struct list_head *entry; 2912 2919 unsigned span = 0; 2913 - int ret; 2914 2920 2915 - if (!req->io) { 2916 - if (io_alloc_async_ctx(req)) 2917 - return -ENOMEM; 2918 - ret = io_timeout_prep(req, req->io, false); 2919 - if (ret) 2920 - return ret; 2921 - } 2922 2921 data = &req->io->timeout; 2923 2922 2924 2923 /* ··· 2917 2934 * timeout event to be satisfied. If it isn't set, then this is 2918 2935 * a pure timeout request, sequence isn't used. 2919 2936 */ 2920 - count = READ_ONCE(sqe->off); 2937 + count = req->timeout.count; 2921 2938 if (!count) { 2922 2939 req->flags |= REQ_F_TIMEOUT_NOSEQ; 2923 2940 spin_lock_irq(&ctx->completion_lock); ··· 3035 3052 io_put_req_find_next(req, nxt); 3036 3053 } 3037 3054 3038 - static int io_async_cancel_prep(struct io_kiocb *req) 3055 + static int io_async_cancel_prep(struct io_kiocb *req, 3056 + const struct io_uring_sqe *sqe) 3039 3057 { 3040 - const struct io_uring_sqe *sqe = req->sqe; 3041 - 3042 - if (req->flags & REQ_F_PREPPED) 3043 - return 0; 3044 3058 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3045 3059 return -EINVAL; 3046 3060 if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || 3047 3061 sqe->cancel_flags) 3048 3062 return -EINVAL; 3049 3063 3050 - req->flags |= REQ_F_PREPPED; 3051 3064 req->cancel.addr = READ_ONCE(sqe->addr); 3052 3065 return 0; 3053 3066 } ··· 3051 3072 static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) 3052 3073 { 3053 3074 struct io_ring_ctx *ctx = req->ctx; 3054 - int ret; 3055 - 3056 - ret = io_async_cancel_prep(req); 3057 - if (ret) 3058 - return ret; 3059 3075 3060 3076 io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); 3061 3077 return 0; 3062 3078 } 3063 3079 3064 - static int io_req_defer_prep(struct 
io_kiocb *req) 3080 + static int io_req_defer_prep(struct io_kiocb *req, 3081 + const struct io_uring_sqe *sqe) 3065 3082 { 3066 - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 3067 - struct io_async_ctx *io = req->io; 3068 - struct iov_iter iter; 3069 3083 ssize_t ret = 0; 3070 3084 3071 3085 switch (req->opcode) { ··· 3066 3094 break; 3067 3095 case IORING_OP_READV: 3068 3096 case IORING_OP_READ_FIXED: 3069 - /* ensure prep does right import */ 3070 - req->io = NULL; 3071 - ret = io_read_prep(req, &iovec, &iter, true); 3072 - req->io = io; 3073 - if (ret < 0) 3074 - break; 3075 - io_req_map_rw(req, ret, iovec, inline_vecs, &iter); 3076 - ret = 0; 3097 + ret = io_read_prep(req, sqe, true); 3077 3098 break; 3078 3099 case IORING_OP_WRITEV: 3079 3100 case IORING_OP_WRITE_FIXED: 3080 - /* ensure prep does right import */ 3081 - req->io = NULL; 3082 - ret = io_write_prep(req, &iovec, &iter, true); 3083 - req->io = io; 3084 - if (ret < 0) 3085 - break; 3086 - io_req_map_rw(req, ret, iovec, inline_vecs, &iter); 3087 - ret = 0; 3101 + ret = io_write_prep(req, sqe, true); 3088 3102 break; 3089 3103 case IORING_OP_POLL_ADD: 3090 - ret = io_poll_add_prep(req); 3104 + ret = io_poll_add_prep(req, sqe); 3091 3105 break; 3092 3106 case IORING_OP_POLL_REMOVE: 3093 - ret = io_poll_remove_prep(req); 3107 + ret = io_poll_remove_prep(req, sqe); 3094 3108 break; 3095 3109 case IORING_OP_FSYNC: 3096 - ret = io_prep_fsync(req); 3110 + ret = io_prep_fsync(req, sqe); 3097 3111 break; 3098 3112 case IORING_OP_SYNC_FILE_RANGE: 3099 - ret = io_prep_sfr(req); 3113 + ret = io_prep_sfr(req, sqe); 3100 3114 break; 3101 3115 case IORING_OP_SENDMSG: 3102 - ret = io_sendmsg_prep(req, io); 3116 + ret = io_sendmsg_prep(req, sqe); 3103 3117 break; 3104 3118 case IORING_OP_RECVMSG: 3105 - ret = io_recvmsg_prep(req, io); 3119 + ret = io_recvmsg_prep(req, sqe); 3106 3120 break; 3107 3121 case IORING_OP_CONNECT: 3108 - ret = io_connect_prep(req, io); 3122 + ret = io_connect_prep(req, sqe); 
3109 3123 break; 3110 3124 case IORING_OP_TIMEOUT: 3111 - ret = io_timeout_prep(req, io, false); 3125 + ret = io_timeout_prep(req, sqe, false); 3112 3126 break; 3113 3127 case IORING_OP_TIMEOUT_REMOVE: 3114 - ret = io_timeout_remove_prep(req); 3128 + ret = io_timeout_remove_prep(req, sqe); 3115 3129 break; 3116 3130 case IORING_OP_ASYNC_CANCEL: 3117 - ret = io_async_cancel_prep(req); 3131 + ret = io_async_cancel_prep(req, sqe); 3118 3132 break; 3119 3133 case IORING_OP_LINK_TIMEOUT: 3120 - ret = io_timeout_prep(req, io, true); 3134 + ret = io_timeout_prep(req, sqe, true); 3121 3135 break; 3122 3136 case IORING_OP_ACCEPT: 3123 - ret = io_accept_prep(req); 3137 + ret = io_accept_prep(req, sqe); 3124 3138 break; 3125 3139 default: 3126 3140 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", ··· 3118 3160 return ret; 3119 3161 } 3120 3162 3121 - static int io_req_defer(struct io_kiocb *req) 3163 + static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3122 3164 { 3123 3165 struct io_ring_ctx *ctx = req->ctx; 3124 3166 int ret; ··· 3127 3169 if (!req_need_defer(req) && list_empty(&ctx->defer_list)) 3128 3170 return 0; 3129 3171 3130 - if (io_alloc_async_ctx(req)) 3172 + if (!req->io && io_alloc_async_ctx(req)) 3131 3173 return -EAGAIN; 3132 3174 3133 - ret = io_req_defer_prep(req); 3175 + ret = io_req_defer_prep(req, sqe); 3134 3176 if (ret < 0) 3135 3177 return ret; 3136 3178 ··· 3146 3188 return -EIOCBQUEUED; 3147 3189 } 3148 3190 3149 - __attribute__((nonnull)) 3150 - static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, 3151 - bool force_nonblock) 3191 + static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, 3192 + struct io_kiocb **nxt, bool force_nonblock) 3152 3193 { 3153 3194 struct io_ring_ctx *ctx = req->ctx; 3154 3195 int ret; ··· 3157 3200 ret = io_nop(req); 3158 3201 break; 3159 3202 case IORING_OP_READV: 3160 - if (unlikely(req->sqe->buf_index)) 3161 - return -EINVAL; 3203 + case 
IORING_OP_READ_FIXED: 3204 + if (sqe) { 3205 + ret = io_read_prep(req, sqe, force_nonblock); 3206 + if (ret < 0) 3207 + break; 3208 + } 3162 3209 ret = io_read(req, nxt, force_nonblock); 3163 3210 break; 3164 3211 case IORING_OP_WRITEV: 3165 - if (unlikely(req->sqe->buf_index)) 3166 - return -EINVAL; 3167 - ret = io_write(req, nxt, force_nonblock); 3168 - break; 3169 - case IORING_OP_READ_FIXED: 3170 - ret = io_read(req, nxt, force_nonblock); 3171 - break; 3172 3212 case IORING_OP_WRITE_FIXED: 3213 + if (sqe) { 3214 + ret = io_write_prep(req, sqe, force_nonblock); 3215 + if (ret < 0) 3216 + break; 3217 + } 3173 3218 ret = io_write(req, nxt, force_nonblock); 3174 3219 break; 3175 3220 case IORING_OP_FSYNC: 3221 + if (sqe) { 3222 + ret = io_prep_fsync(req, sqe); 3223 + if (ret < 0) 3224 + break; 3225 + } 3176 3226 ret = io_fsync(req, nxt, force_nonblock); 3177 3227 break; 3178 3228 case IORING_OP_POLL_ADD: 3229 + if (sqe) { 3230 + ret = io_poll_add_prep(req, sqe); 3231 + if (ret) 3232 + break; 3233 + } 3179 3234 ret = io_poll_add(req, nxt); 3180 3235 break; 3181 3236 case IORING_OP_POLL_REMOVE: 3237 + if (sqe) { 3238 + ret = io_poll_remove_prep(req, sqe); 3239 + if (ret < 0) 3240 + break; 3241 + } 3182 3242 ret = io_poll_remove(req); 3183 3243 break; 3184 3244 case IORING_OP_SYNC_FILE_RANGE: 3245 + if (sqe) { 3246 + ret = io_prep_sfr(req, sqe); 3247 + if (ret < 0) 3248 + break; 3249 + } 3185 3250 ret = io_sync_file_range(req, nxt, force_nonblock); 3186 3251 break; 3187 3252 case IORING_OP_SENDMSG: 3253 + if (sqe) { 3254 + ret = io_sendmsg_prep(req, sqe); 3255 + if (ret < 0) 3256 + break; 3257 + } 3188 3258 ret = io_sendmsg(req, nxt, force_nonblock); 3189 3259 break; 3190 3260 case IORING_OP_RECVMSG: 3261 + if (sqe) { 3262 + ret = io_recvmsg_prep(req, sqe); 3263 + if (ret) 3264 + break; 3265 + } 3191 3266 ret = io_recvmsg(req, nxt, force_nonblock); 3192 3267 break; 3193 3268 case IORING_OP_TIMEOUT: 3269 + if (sqe) { 3270 + ret = io_timeout_prep(req, sqe, false); 3271 
+ if (ret) 3272 + break; 3273 + } 3194 3274 ret = io_timeout(req); 3195 3275 break; 3196 3276 case IORING_OP_TIMEOUT_REMOVE: 3277 + if (sqe) { 3278 + ret = io_timeout_remove_prep(req, sqe); 3279 + if (ret) 3280 + break; 3281 + } 3197 3282 ret = io_timeout_remove(req); 3198 3283 break; 3199 3284 case IORING_OP_ACCEPT: 3285 + if (sqe) { 3286 + ret = io_accept_prep(req, sqe); 3287 + if (ret) 3288 + break; 3289 + } 3200 3290 ret = io_accept(req, nxt, force_nonblock); 3201 3291 break; 3202 3292 case IORING_OP_CONNECT: 3293 + if (sqe) { 3294 + ret = io_connect_prep(req, sqe); 3295 + if (ret) 3296 + break; 3297 + } 3203 3298 ret = io_connect(req, nxt, force_nonblock); 3204 3299 break; 3205 3300 case IORING_OP_ASYNC_CANCEL: 3301 + if (sqe) { 3302 + ret = io_async_cancel_prep(req, sqe); 3303 + if (ret) 3304 + break; 3305 + } 3206 3306 ret = io_async_cancel(req, nxt); 3207 3307 break; 3208 3308 default: ··· 3303 3289 req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; 3304 3290 req->in_async = true; 3305 3291 do { 3306 - ret = io_issue_sqe(req, &nxt, false); 3292 + ret = io_issue_sqe(req, NULL, &nxt, false); 3307 3293 /* 3308 3294 * We can get EAGAIN for polled IO even though we're 3309 3295 * forcing a sync submission from here, since we can't ··· 3369 3355 return table->files[index & IORING_FILE_TABLE_MASK]; 3370 3356 } 3371 3357 3372 - static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) 3358 + static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, 3359 + const struct io_uring_sqe *sqe) 3373 3360 { 3374 3361 struct io_ring_ctx *ctx = req->ctx; 3375 3362 unsigned flags; 3376 3363 int fd, ret; 3377 3364 3378 - flags = READ_ONCE(req->sqe->flags); 3379 - fd = READ_ONCE(req->sqe->fd); 3365 + flags = READ_ONCE(sqe->flags); 3366 + fd = READ_ONCE(sqe->fd); 3380 3367 3381 3368 if (flags & IOSQE_IO_DRAIN) 3382 3369 req->flags |= REQ_F_IO_DRAIN; ··· 3509 3494 return nxt; 3510 3495 } 3511 3496 3512 - static void 
__io_queue_sqe(struct io_kiocb *req) 3497 + static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3513 3498 { 3514 3499 struct io_kiocb *linked_timeout; 3515 3500 struct io_kiocb *nxt = NULL; ··· 3518 3503 again: 3519 3504 linked_timeout = io_prep_linked_timeout(req); 3520 3505 3521 - ret = io_issue_sqe(req, &nxt, true); 3506 + ret = io_issue_sqe(req, sqe, &nxt, true); 3522 3507 3523 3508 /* 3524 3509 * We async punt it if the file wasn't marked NOWAIT, or if the file ··· 3565 3550 } 3566 3551 } 3567 3552 3568 - static void io_queue_sqe(struct io_kiocb *req) 3553 + static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3569 3554 { 3570 3555 int ret; 3571 3556 ··· 3575 3560 } 3576 3561 req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); 3577 3562 3578 - ret = io_req_defer(req); 3563 + ret = io_req_defer(req, sqe); 3579 3564 if (ret) { 3580 3565 if (ret != -EIOCBQUEUED) { 3581 3566 io_cqring_add_event(req, ret); ··· 3583 3568 io_double_put_req(req); 3584 3569 } 3585 3570 } else 3586 - __io_queue_sqe(req); 3571 + __io_queue_sqe(req, sqe); 3587 3572 } 3588 3573 3589 3574 static inline void io_queue_link_head(struct io_kiocb *req) ··· 3592 3577 io_cqring_add_event(req, -ECANCELED); 3593 3578 io_double_put_req(req); 3594 3579 } else 3595 - io_queue_sqe(req); 3580 + io_queue_sqe(req, NULL); 3596 3581 } 3597 3582 3598 3583 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ 3599 3584 IOSQE_IO_HARDLINK) 3600 3585 3601 - static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, 3602 - struct io_kiocb **link) 3586 + static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, 3587 + struct io_submit_state *state, struct io_kiocb **link) 3603 3588 { 3604 3589 struct io_ring_ctx *ctx = req->ctx; 3605 3590 int ret; 3606 3591 3607 3592 /* enforce forwards compatibility on users */ 3608 - if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { 3593 + if 
(unlikely(sqe->flags & ~SQE_VALID_FLAGS)) { 3609 3594 ret = -EINVAL; 3610 3595 goto err_req; 3611 3596 } 3612 3597 3613 - ret = io_req_set_file(state, req); 3598 + ret = io_req_set_file(state, req, sqe); 3614 3599 if (unlikely(ret)) { 3615 3600 err_req: 3616 3601 io_cqring_add_event(req, ret); ··· 3628 3613 if (*link) { 3629 3614 struct io_kiocb *prev = *link; 3630 3615 3631 - if (req->sqe->flags & IOSQE_IO_DRAIN) 3616 + if (sqe->flags & IOSQE_IO_DRAIN) 3632 3617 (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; 3633 3618 3634 - if (req->sqe->flags & IOSQE_IO_HARDLINK) 3619 + if (sqe->flags & IOSQE_IO_HARDLINK) 3635 3620 req->flags |= REQ_F_HARDLINK; 3636 3621 3637 3622 if (io_alloc_async_ctx(req)) { ··· 3639 3624 goto err_req; 3640 3625 } 3641 3626 3642 - ret = io_req_defer_prep(req); 3627 + ret = io_req_defer_prep(req, sqe); 3643 3628 if (ret) { 3644 3629 /* fail even hard links since we don't submit */ 3645 3630 prev->flags |= REQ_F_FAIL_LINK; ··· 3647 3632 } 3648 3633 trace_io_uring_link(ctx, req, prev); 3649 3634 list_add_tail(&req->link_list, &prev->link_list); 3650 - } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { 3635 + } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { 3651 3636 req->flags |= REQ_F_LINK; 3652 - if (req->sqe->flags & IOSQE_IO_HARDLINK) 3637 + if (sqe->flags & IOSQE_IO_HARDLINK) 3653 3638 req->flags |= REQ_F_HARDLINK; 3654 3639 3655 3640 INIT_LIST_HEAD(&req->link_list); 3641 + ret = io_req_defer_prep(req, sqe); 3642 + if (ret) 3643 + req->flags |= REQ_F_FAIL_LINK; 3656 3644 *link = req; 3657 3645 } else { 3658 - io_queue_sqe(req); 3646 + io_queue_sqe(req, sqe); 3659 3647 } 3660 3648 3661 3649 return true; ··· 3703 3685 } 3704 3686 3705 3687 /* 3706 - * Fetch an sqe, if one is available. Note that req->sqe will point to memory 3688 + * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory 3707 3689 * that is mapped by userspace. 
This means that care needs to be taken to 3708 3690 * ensure that reads are stable, as we cannot rely on userspace always 3709 3691 * being a good citizen. If members of the sqe are validated and then later 3710 3692 * used, it's important that those reads are done through READ_ONCE() to 3711 3693 * prevent a re-load down the line. 3712 3694 */ 3713 - static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) 3695 + static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, 3696 + const struct io_uring_sqe **sqe_ptr) 3714 3697 { 3715 3698 struct io_rings *rings = ctx->rings; 3716 3699 u32 *sq_array = ctx->sq_array; ··· 3738 3719 * link list. 3739 3720 */ 3740 3721 req->sequence = ctx->cached_sq_head; 3741 - req->sqe = &ctx->sq_sqes[head]; 3742 - req->opcode = READ_ONCE(req->sqe->opcode); 3743 - req->user_data = READ_ONCE(req->sqe->user_data); 3722 + *sqe_ptr = &ctx->sq_sqes[head]; 3723 + req->opcode = READ_ONCE((*sqe_ptr)->opcode); 3724 + req->user_data = READ_ONCE((*sqe_ptr)->user_data); 3744 3725 ctx->cached_sq_head++; 3745 3726 return true; 3746 3727 } ··· 3772 3753 } 3773 3754 3774 3755 for (i = 0; i < nr; i++) { 3756 + const struct io_uring_sqe *sqe; 3775 3757 struct io_kiocb *req; 3776 3758 unsigned int sqe_flags; 3777 3759 ··· 3782 3762 submitted = -EAGAIN; 3783 3763 break; 3784 3764 } 3785 - if (!io_get_sqring(ctx, req)) { 3765 + if (!io_get_sqring(ctx, req, &sqe)) { 3786 3766 __io_free_req(req); 3787 3767 break; 3788 3768 } ··· 3796 3776 } 3797 3777 3798 3778 submitted++; 3799 - sqe_flags = req->sqe->flags; 3779 + sqe_flags = sqe->flags; 3800 3780 3801 3781 req->ring_file = ring_file; 3802 3782 req->ring_fd = ring_fd; ··· 3804 3784 req->in_async = async; 3805 3785 req->needs_fixed_file = async; 3806 3786 trace_io_uring_submit_sqe(ctx, req->user_data, true, async); 3807 - if (!io_submit_sqe(req, statep, &link)) 3787 + if (!io_submit_sqe(req, sqe, statep, &link)) 3808 3788 break; 3809 3789 /* 3810 3790 * If previous wasn't linked 
and we have a linked command, ··· 4722 4702 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) 4723 4703 return -EFAULT; 4724 4704 4725 - dst->iov_base = (void __user *) (unsigned long) ciov.iov_base; 4705 + dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); 4726 4706 dst->iov_len = ciov.iov_len; 4727 4707 return 0; 4728 4708 }