Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-6.15/io_uring-20250322' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:
"This is the first of the io_uring pull requests for the 6.15 merge
window; there will be others once the net tree has gone in. This
contains:

- Cleanup and unification of cancelation handling across various
request types.

- Improvement for bundles, supporting them both for incrementally
consumed buffers, and for non-multishot requests.

- Enable toggling whether iowait is used while waiting on io_uring
events. Unfortunately this is still tied to CPU frequency boosting
on short waits, as the scheduler side has not been very receptive
to splitting the (useless) iowait stat from the cpufreq-implied
boost.

- Add support for kbuf nodes, enabling zero-copy support for the ublk
block driver.

- Various cleanups for resource node handling.

- Series greatly cleaning up the legacy provided (non-ring based)
buffers. For years, we've been pushing the ring provided buffers as
the way to go, and that is what people have been using. Reduce the
complexity and code associated with legacy provided buffers.

- Series cleaning up the compat handling.

- Series improving and cleaning up the recvmsg/sendmsg iovec and msg
handling.

- Series of cleanups for io-wq.

- Start adding a bunch of selftests. The liburing repository
generally carries feature and regression tests for everything, but
at least for ublk initially, we'll try and go the route of having
it in selftests as well. We'll see how this goes; we might decide to
migrate more tests this way in the future.

- Various little cleanups and fixes"

* tag 'for-6.15/io_uring-20250322' of git://git.kernel.dk/linux: (108 commits)
selftests: ublk: add stripe target
selftests: ublk: simplify loop io completion
selftests: ublk: enable zero copy for null target
selftests: ublk: prepare for supporting stripe target
selftests: ublk: move common code into common.c
selftests: ublk: increase max buffer size to 1MB
selftests: ublk: add single sqe allocator helper
selftests: ublk: add generic_01 for verifying sequential IO order
selftests: ublk: fix starting ublk device
io_uring: enable toggle of iowait usage when waiting on CQEs
selftests: ublk: fix write cache implementation
selftests: ublk: add variable for user to not show test result
selftests: ublk: don't show `modprobe` failure
selftests: ublk: add one dependency header
io_uring/kbuf: enable bundles for incrementally consumed buffers
Revert "io_uring/rsrc: simplify the bvec iter count calculation"
selftests: ublk: improve test usability
selftests: ublk: add stress test for covering IO vs. killing ublk server
selftests: ublk: add one stress test for covering IO vs. removing device
selftests: ublk: load/unload ublk_drv when preparing & cleaning up tests
...

+3884 -873
+1
MAINTAINERS
··· 24397 24397 F: Documentation/block/ublk.rst 24398 24398 F: drivers/block/ublk_drv.c 24399 24399 F: include/uapi/linux/ublk_cmd.h 24400 + F: tools/testing/selftests/ublk/ 24400 24401 24401 24402 UBSAN 24402 24403 M: Kees Cook <kees@kernel.org>
+49 -7
drivers/block/ublk_drv.c
··· 51 51 /* private ioctl command mirror */ 52 52 #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) 53 53 54 + #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) 55 + #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) 56 + 54 57 /* All UBLK_F_* have to be included into UBLK_F_ALL */ 55 58 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ 56 59 | UBLK_F_URING_CMD_COMP_IN_TASK \ ··· 199 196 200 197 static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq); 201 198 199 + static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, 200 + struct ublk_queue *ubq, int tag, size_t offset); 202 201 static inline unsigned int ublk_req_build_flags(struct request *req); 203 202 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, 204 203 int tag); 205 204 static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) 206 205 { 207 - return ub->dev_info.flags & UBLK_F_USER_COPY; 206 + return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); 208 207 } 209 208 210 209 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) ··· 586 581 587 582 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) 588 583 { 589 - return ubq->flags & UBLK_F_USER_COPY; 584 + return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); 590 585 } 591 586 592 587 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) ··· 1752 1747 io_uring_cmd_mark_cancelable(cmd, issue_flags); 1753 1748 } 1754 1749 1750 + static void ublk_io_release(void *priv) 1751 + { 1752 + struct request *rq = priv; 1753 + struct ublk_queue *ubq = rq->mq_hctx->driver_data; 1754 + 1755 + ublk_put_req_ref(ubq, rq); 1756 + } 1757 + 1758 + static int ublk_register_io_buf(struct io_uring_cmd *cmd, 1759 + struct ublk_queue *ubq, unsigned int tag, 1760 + unsigned int index, unsigned int issue_flags) 1761 + { 1762 + struct ublk_device *ub = 
cmd->file->private_data; 1763 + struct request *req; 1764 + int ret; 1765 + 1766 + req = __ublk_check_and_get_req(ub, ubq, tag, 0); 1767 + if (!req) 1768 + return -EINVAL; 1769 + 1770 + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, 1771 + issue_flags); 1772 + if (ret) { 1773 + ublk_put_req_ref(ubq, req); 1774 + return ret; 1775 + } 1776 + 1777 + return 0; 1778 + } 1779 + 1780 + static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, 1781 + unsigned int index, unsigned int issue_flags) 1782 + { 1783 + return io_buffer_unregister_bvec(cmd, index, issue_flags); 1784 + } 1785 + 1755 1786 static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, 1756 1787 unsigned int issue_flags, 1757 1788 const struct ublksrv_io_cmd *ub_cmd) ··· 1839 1798 1840 1799 ret = -EINVAL; 1841 1800 switch (_IOC_NR(cmd_op)) { 1801 + case UBLK_IO_REGISTER_IO_BUF: 1802 + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); 1803 + case UBLK_IO_UNREGISTER_IO_BUF: 1804 + return ublk_unregister_io_buf(cmd, ub_cmd->addr, issue_flags); 1842 1805 case UBLK_IO_FETCH_REQ: 1843 1806 /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ 1844 1807 if (ublk_queue_ready(ubq)) { ··· 2504 2459 * buffer by pwrite() to ublk char device, which can't be 2505 2460 * used for unprivileged device 2506 2461 */ 2507 - if (info.flags & UBLK_F_USER_COPY) 2462 + if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) 2508 2463 return -EINVAL; 2509 2464 } 2510 2465 ··· 2571 2526 ret = -EINVAL; 2572 2527 goto out_free_dev_number; 2573 2528 } 2574 - 2575 - /* We are not ready to support zero copy */ 2576 - ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; 2577 2529 2578 2530 ub->dev_info.nr_hw_queues = min_t(unsigned int, 2579 2531 ub->dev_info.nr_hw_queues, nr_cpu_ids); ··· 2905 2863 { 2906 2864 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2907 2865 void __user *argp = (void __user *)(unsigned long)header->addr; 2908 - u64 features = UBLK_F_ALL & 
~UBLK_F_SUPPORT_ZERO_COPY; 2866 + u64 features = UBLK_F_ALL; 2909 2867 2910 2868 if (header->len != UBLK_FEATURES_LEN || !header->addr) 2911 2869 return -EINVAL;
+7 -5
drivers/nvme/host/ioctl.c
··· 114 114 115 115 static int nvme_map_user_request(struct request *req, u64 ubuffer, 116 116 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 117 - struct io_uring_cmd *ioucmd, unsigned int flags) 117 + struct io_uring_cmd *ioucmd, unsigned int flags, 118 + unsigned int iou_issue_flags) 118 119 { 119 120 struct request_queue *q = req->q; 120 121 struct nvme_ns *ns = q->queuedata; ··· 147 146 goto out; 148 147 } 149 148 ret = io_uring_cmd_import_fixed(ubuffer, bufflen, 150 - rq_data_dir(req), &iter, ioucmd); 149 + rq_data_dir(req), &iter, ioucmd, 150 + iou_issue_flags); 151 151 if (ret < 0) 152 152 goto out; 153 153 ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); ··· 200 198 req->timeout = timeout; 201 199 if (ubuffer && bufflen) { 202 200 ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, 203 - meta_len, NULL, flags); 201 + meta_len, NULL, flags, 0); 204 202 if (ret) 205 203 return ret; 206 204 } ··· 516 514 return PTR_ERR(req); 517 515 req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0; 518 516 519 - if (d.addr && d.data_len) { 517 + if (d.data_len) { 520 518 ret = nvme_map_user_request(req, d.addr, 521 519 d.data_len, nvme_to_user_ptr(d.metadata), 522 - d.metadata_len, ioucmd, vec); 520 + d.metadata_len, ioucmd, vec, issue_flags); 523 521 if (ret) 524 522 return ret; 525 523 }
+14 -3
include/linux/io_uring/cmd.h
··· 4 4 5 5 #include <uapi/linux/io_uring.h> 6 6 #include <linux/io_uring_types.h> 7 + #include <linux/blk-mq.h> 7 8 8 9 /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ 9 10 #define IORING_URING_CMD_CANCELABLE (1U << 30) ··· 40 39 41 40 #if defined(CONFIG_IO_URING) 42 41 int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 43 - struct iov_iter *iter, void *ioucmd); 42 + struct iov_iter *iter, 43 + struct io_uring_cmd *ioucmd, 44 + unsigned int issue_flags); 44 45 45 46 /* 46 47 * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd ··· 69 66 void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); 70 67 71 68 #else 72 - static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 73 - struct iov_iter *iter, void *ioucmd) 69 + static inline int 70 + io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 71 + struct iov_iter *iter, struct io_uring_cmd *ioucmd, 72 + unsigned int issue_flags) 74 73 { 75 74 return -EOPNOTSUPP; 76 75 } ··· 127 122 { 128 123 return cmd_to_io_kiocb(cmd)->async_data; 129 124 } 125 + 126 + int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, 127 + void (*release)(void *), unsigned int index, 128 + unsigned int issue_flags); 129 + int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, 130 + unsigned int issue_flags); 130 131 131 132 #endif /* _LINUX_IO_URING_CMD_H */
+15 -5
include/linux/io_uring_types.h
··· 292 292 293 293 struct io_file_table file_table; 294 294 struct io_rsrc_data buf_table; 295 + struct io_alloc_cache node_cache; 296 + struct io_alloc_cache imu_cache; 295 297 296 298 struct io_submit_state submit_state; 297 299 ··· 362 360 363 361 spinlock_t completion_lock; 364 362 365 - struct list_head io_buffers_comp; 366 363 struct list_head cq_overflow_list; 367 364 368 365 struct hlist_head waitid_list; ··· 379 378 380 379 unsigned int file_alloc_start; 381 380 unsigned int file_alloc_end; 382 - 383 - struct list_head io_buffers_cache; 384 381 385 382 /* Keep this last, we don't need it for the fast path */ 386 383 struct wait_queue_head poll_wq; ··· 438 439 struct io_mapped_region param_region; 439 440 }; 440 441 442 + /* 443 + * Token indicating function is called in task work context: 444 + * ctx->uring_lock is held and any completions generated will be flushed. 445 + * ONLY core io_uring.c should instantiate this struct. 446 + */ 441 447 struct io_tw_state { 442 448 }; 449 + /* Alias to use in code that doesn't instantiate struct io_tw_state */ 450 + typedef struct io_tw_state io_tw_token_t; 443 451 444 452 enum { 445 453 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, ··· 572 566 REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), 573 567 }; 574 568 575 - typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); 569 + typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); 576 570 577 571 struct io_task_work { 578 572 struct llist_node node; ··· 607 601 io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \ 608 602 ((cmd_type *)&(req)->cmd) \ 609 603 ) 610 - #define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) 604 + 605 + static inline struct io_kiocb *cmd_to_io_kiocb(void *ptr) 606 + { 607 + return ptr; 608 + } 611 609 612 610 struct io_kiocb { 613 611 union {
+2
include/uapi/linux/io_uring.h
··· 541 541 #define IORING_ENTER_REGISTERED_RING (1U << 4) 542 542 #define IORING_ENTER_ABS_TIMER (1U << 5) 543 543 #define IORING_ENTER_EXT_ARG_REG (1U << 6) 544 + #define IORING_ENTER_NO_IOWAIT (1U << 7) 544 545 545 546 /* 546 547 * Passed in for io_uring_setup(2). Copied back with updated info on success ··· 579 578 #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) 580 579 #define IORING_FEAT_MIN_TIMEOUT (1U << 15) 581 580 #define IORING_FEAT_RW_ATTR (1U << 16) 581 + #define IORING_FEAT_NO_IOWAIT (1U << 17) 582 582 583 583 /* 584 584 * io_uring_register(2) opcodes and arguments
+4
include/uapi/linux/ublk_cmd.h
··· 94 94 _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd) 95 95 #define UBLK_U_IO_NEED_GET_DATA \ 96 96 _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd) 97 + #define UBLK_U_IO_REGISTER_IO_BUF \ 98 + _IOWR('u', 0x23, struct ublksrv_io_cmd) 99 + #define UBLK_U_IO_UNREGISTER_IO_BUF \ 100 + _IOWR('u', 0x24, struct ublksrv_io_cmd) 97 101 98 102 /* only ABORT means that no re-fetch */ 99 103 #define UBLK_IO_RES_OK 0
+6
io_uring/alloc_cache.h
··· 68 68 return io_cache_alloc_new(cache, gfp); 69 69 } 70 70 71 + static inline void io_cache_free(struct io_alloc_cache *cache, void *obj) 72 + { 73 + if (!io_alloc_cache_put(cache, obj)) 74 + kfree(obj); 75 + } 76 + 71 77 #endif
+42
io_uring/cancel.c
··· 341 341 fput(file); 342 342 return ret; 343 343 } 344 + 345 + bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 346 + struct hlist_head *list, bool cancel_all, 347 + bool (*cancel)(struct io_kiocb *)) 348 + { 349 + struct hlist_node *tmp; 350 + struct io_kiocb *req; 351 + bool found = false; 352 + 353 + lockdep_assert_held(&ctx->uring_lock); 354 + 355 + hlist_for_each_entry_safe(req, tmp, list, hash_node) { 356 + if (!io_match_task_safe(req, tctx, cancel_all)) 357 + continue; 358 + hlist_del_init(&req->hash_node); 359 + if (cancel(req)) 360 + found = true; 361 + } 362 + 363 + return found; 364 + } 365 + 366 + int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 367 + unsigned int issue_flags, struct hlist_head *list, 368 + bool (*cancel)(struct io_kiocb *)) 369 + { 370 + struct hlist_node *tmp; 371 + struct io_kiocb *req; 372 + int nr = 0; 373 + 374 + io_ring_submit_lock(ctx, issue_flags); 375 + hlist_for_each_entry_safe(req, tmp, list, hash_node) { 376 + if (!io_cancel_req_match(req, cd)) 377 + continue; 378 + if (cancel(req)) 379 + nr++; 380 + if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) 381 + break; 382 + } 383 + io_ring_submit_unlock(ctx, issue_flags); 384 + return nr ?: -ENOENT; 385 + }
+8
io_uring/cancel.h
··· 24 24 int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); 25 25 bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); 26 26 27 + bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 28 + struct hlist_head *list, bool cancel_all, 29 + bool (*cancel)(struct io_kiocb *)); 30 + 31 + int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 32 + unsigned int issue_flags, struct hlist_head *list, 33 + bool (*cancel)(struct io_kiocb *)); 34 + 27 35 static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) 28 36 { 29 37 if (req->cancel_seq_set && sequence == req->work.cancel_seq)
+1 -1
io_uring/filetable.c
··· 68 68 if (slot_index >= ctx->file_table.data.nr) 69 69 return -EINVAL; 70 70 71 - node = io_rsrc_node_alloc(IORING_RSRC_FILE); 71 + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 72 72 if (!node) 73 73 return -ENOMEM; 74 74
+12 -50
io_uring/futex.c
··· 44 44 io_alloc_cache_free(&ctx->futex_cache, kfree); 45 45 } 46 46 47 - static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) 47 + static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) 48 48 { 49 49 req->async_data = NULL; 50 50 hlist_del_init(&req->hash_node); 51 - io_req_task_complete(req, ts); 51 + io_req_task_complete(req, tw); 52 52 } 53 53 54 - static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) 54 + static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) 55 55 { 56 - struct io_futex_data *ifd = req->async_data; 57 56 struct io_ring_ctx *ctx = req->ctx; 58 57 59 - io_tw_lock(ctx, ts); 60 - if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) 61 - kfree(ifd); 62 - __io_futex_complete(req, ts); 58 + io_tw_lock(ctx, tw); 59 + io_cache_free(&ctx->futex_cache, req->async_data); 60 + __io_futex_complete(req, tw); 63 61 } 64 62 65 - static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) 63 + static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) 66 64 { 67 65 struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 68 66 struct futex_vector *futexv = req->async_data; 69 67 70 - io_tw_lock(req->ctx, ts); 68 + io_tw_lock(req->ctx, tw); 71 69 72 70 if (!iof->futexv_unqueued) { 73 71 int res; ··· 77 79 78 80 kfree(req->async_data); 79 81 req->flags &= ~REQ_F_ASYNC_DATA; 80 - __io_futex_complete(req, ts); 82 + __io_futex_complete(req, tw); 81 83 } 82 84 83 85 static bool io_futexv_claim(struct io_futex *iof) ··· 88 90 return true; 89 91 } 90 92 91 - static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) 93 + static bool __io_futex_cancel(struct io_kiocb *req) 92 94 { 93 95 /* futex wake already done or in progress */ 94 96 if (req->opcode == IORING_OP_FUTEX_WAIT) { ··· 114 116 int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 115 117 unsigned int issue_flags) 116 118 { 117 - struct hlist_node *tmp; 118 - 
struct io_kiocb *req; 119 - int nr = 0; 120 - 121 - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) 122 - return -ENOENT; 123 - 124 - io_ring_submit_lock(ctx, issue_flags); 125 - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { 126 - if (req->cqe.user_data != cd->data && 127 - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) 128 - continue; 129 - if (__io_futex_cancel(ctx, req)) 130 - nr++; 131 - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) 132 - break; 133 - } 134 - io_ring_submit_unlock(ctx, issue_flags); 135 - 136 - if (nr) 137 - return nr; 138 - 139 - return -ENOENT; 119 + return io_cancel_remove(ctx, cd, issue_flags, &ctx->futex_list, __io_futex_cancel); 140 120 } 141 121 142 122 bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 143 123 bool cancel_all) 144 124 { 145 - struct hlist_node *tmp; 146 - struct io_kiocb *req; 147 - bool found = false; 148 - 149 - lockdep_assert_held(&ctx->uring_lock); 150 - 151 - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { 152 - if (!io_match_task_safe(req, tctx, cancel_all)) 153 - continue; 154 - hlist_del_init(&req->hash_node); 155 - __io_futex_cancel(ctx, req); 156 - found = true; 157 - } 158 - 159 - return found; 125 + return io_cancel_remove_all(ctx, tctx, &ctx->futex_list, cancel_all, __io_futex_cancel); 160 126 } 161 127 162 128 int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+131 -99
io_uring/io-wq.c
··· 30 30 IO_WORKER_F_UP = 0, /* up and active */ 31 31 IO_WORKER_F_RUNNING = 1, /* account as running */ 32 32 IO_WORKER_F_FREE = 2, /* worker on free list */ 33 - IO_WORKER_F_BOUND = 3, /* is doing bounded work */ 34 33 }; 35 34 36 35 enum { ··· 45 46 */ 46 47 struct io_worker { 47 48 refcount_t ref; 48 - int create_index; 49 49 unsigned long flags; 50 50 struct hlist_nulls_node nulls_node; 51 51 struct list_head all_list; 52 52 struct task_struct *task; 53 53 struct io_wq *wq; 54 + struct io_wq_acct *acct; 54 55 55 56 struct io_wq_work *cur_work; 56 57 raw_spinlock_t lock; ··· 76 77 #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) 77 78 78 79 struct io_wq_acct { 80 + /** 81 + * Protects access to the worker lists. 82 + */ 83 + raw_spinlock_t workers_lock; 84 + 79 85 unsigned nr_workers; 80 86 unsigned max_workers; 81 - int index; 82 87 atomic_t nr_running; 88 + 89 + /** 90 + * The list of free workers. Protected by #workers_lock 91 + * (write) and RCU (read). 92 + */ 93 + struct hlist_nulls_head free_list; 94 + 95 + /** 96 + * The list of all workers. Protected by #workers_lock 97 + * (write) and RCU (read). 
98 + */ 99 + struct list_head all_list; 100 + 83 101 raw_spinlock_t lock; 84 102 struct io_wq_work_list work_list; 85 103 unsigned long flags; ··· 128 112 129 113 struct io_wq_acct acct[IO_WQ_ACCT_NR]; 130 114 131 - /* lock protects access to elements below */ 132 - raw_spinlock_t lock; 133 - 134 - struct hlist_nulls_head free_list; 135 - struct list_head all_list; 136 - 137 115 struct wait_queue_entry wait; 138 116 139 117 struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; ··· 145 135 bool cancel_all; 146 136 }; 147 137 148 - static bool create_io_worker(struct io_wq *wq, int index); 138 + static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct); 149 139 static void io_wq_dec_running(struct io_worker *worker); 150 140 static bool io_acct_cancel_pending_work(struct io_wq *wq, 151 141 struct io_wq_acct *acct, ··· 170 160 } 171 161 172 162 static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, 173 - struct io_wq_work *work) 163 + unsigned int work_flags) 174 164 { 175 - return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); 165 + return io_get_acct(wq, !(work_flags & IO_WQ_WORK_UNBOUND)); 176 166 } 177 167 178 168 static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) 179 169 { 180 - return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags)); 170 + return worker->acct; 181 171 } 182 172 183 173 static void io_worker_ref_put(struct io_wq *wq) ··· 202 192 struct io_wq *wq = worker->wq; 203 193 204 194 atomic_dec(&acct->nr_running); 205 - raw_spin_lock(&wq->lock); 195 + raw_spin_lock(&acct->workers_lock); 206 196 acct->nr_workers--; 207 - raw_spin_unlock(&wq->lock); 197 + raw_spin_unlock(&acct->workers_lock); 208 198 io_worker_ref_put(wq); 209 199 clear_bit_unlock(0, &worker->create_state); 210 200 io_worker_release(worker); ··· 223 213 static void io_worker_exit(struct io_worker *worker) 224 214 { 225 215 struct io_wq *wq = worker->wq; 216 + struct io_wq_acct *acct = 
io_wq_get_acct(worker); 226 217 227 218 while (1) { 228 219 struct callback_head *cb = task_work_cancel_match(wq->task, ··· 237 226 io_worker_release(worker); 238 227 wait_for_completion(&worker->ref_done); 239 228 240 - raw_spin_lock(&wq->lock); 229 + raw_spin_lock(&acct->workers_lock); 241 230 if (test_bit(IO_WORKER_F_FREE, &worker->flags)) 242 231 hlist_nulls_del_rcu(&worker->nulls_node); 243 232 list_del_rcu(&worker->all_list); 244 - raw_spin_unlock(&wq->lock); 233 + raw_spin_unlock(&acct->workers_lock); 245 234 io_wq_dec_running(worker); 246 235 /* 247 236 * this worker is a goner, clear ->worker_private to avoid any ··· 280 269 * Check head of free list for an available worker. If one isn't available, 281 270 * caller must create one. 282 271 */ 283 - static bool io_wq_activate_free_worker(struct io_wq *wq, 284 - struct io_wq_acct *acct) 272 + static bool io_acct_activate_free_worker(struct io_wq_acct *acct) 285 273 __must_hold(RCU) 286 274 { 287 275 struct hlist_nulls_node *n; ··· 291 281 * activate. If a given worker is on the free_list but in the process 292 282 * of exiting, keep trying. 293 283 */ 294 - hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) { 284 + hlist_nulls_for_each_entry_rcu(worker, n, &acct->free_list, nulls_node) { 295 285 if (!io_worker_get(worker)) 296 286 continue; 297 - if (io_wq_get_acct(worker) != acct) { 298 - io_worker_release(worker); 299 - continue; 300 - } 301 287 /* 302 288 * If the worker is already running, it's either already 303 289 * starting work or finishing work. 
In either case, if it does ··· 320 314 if (unlikely(!acct->max_workers)) 321 315 pr_warn_once("io-wq is not configured for unbound workers"); 322 316 323 - raw_spin_lock(&wq->lock); 317 + raw_spin_lock(&acct->workers_lock); 324 318 if (acct->nr_workers >= acct->max_workers) { 325 - raw_spin_unlock(&wq->lock); 319 + raw_spin_unlock(&acct->workers_lock); 326 320 return true; 327 321 } 328 322 acct->nr_workers++; 329 - raw_spin_unlock(&wq->lock); 323 + raw_spin_unlock(&acct->workers_lock); 330 324 atomic_inc(&acct->nr_running); 331 325 atomic_inc(&wq->worker_refs); 332 - return create_io_worker(wq, acct->index); 326 + return create_io_worker(wq, acct); 333 327 } 334 328 335 329 static void io_wq_inc_running(struct io_worker *worker) ··· 349 343 350 344 worker = container_of(cb, struct io_worker, create_work); 351 345 wq = worker->wq; 352 - acct = &wq->acct[worker->create_index]; 353 - raw_spin_lock(&wq->lock); 346 + acct = worker->acct; 347 + raw_spin_lock(&acct->workers_lock); 354 348 355 349 if (acct->nr_workers < acct->max_workers) { 356 350 acct->nr_workers++; 357 351 do_create = true; 358 352 } 359 - raw_spin_unlock(&wq->lock); 353 + raw_spin_unlock(&acct->workers_lock); 360 354 if (do_create) { 361 - create_io_worker(wq, worker->create_index); 355 + create_io_worker(wq, acct); 362 356 } else { 363 357 atomic_dec(&acct->nr_running); 364 358 io_worker_ref_put(wq); ··· 390 384 391 385 atomic_inc(&wq->worker_refs); 392 386 init_task_work(&worker->create_work, func); 393 - worker->create_index = acct->index; 394 387 if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { 395 388 /* 396 389 * EXIT may have been set after checking it above, check after ··· 435 430 * Worker will start processing some work. 
Move it to the busy list, if 436 431 * it's currently on the freelist 437 432 */ 438 - static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) 433 + static void __io_worker_busy(struct io_wq_acct *acct, struct io_worker *worker) 439 434 { 440 435 if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { 441 436 clear_bit(IO_WORKER_F_FREE, &worker->flags); 442 - raw_spin_lock(&wq->lock); 437 + raw_spin_lock(&acct->workers_lock); 443 438 hlist_nulls_del_init_rcu(&worker->nulls_node); 444 - raw_spin_unlock(&wq->lock); 439 + raw_spin_unlock(&acct->workers_lock); 445 440 } 446 441 } 447 442 448 443 /* 449 444 * No work, worker going to sleep. Move to freelist. 450 445 */ 451 - static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) 452 - __must_hold(wq->lock) 446 + static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker) 447 + __must_hold(acct->workers_lock) 453 448 { 454 449 if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { 455 450 set_bit(IO_WORKER_F_FREE, &worker->flags); 456 - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); 451 + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); 457 452 } 453 + } 454 + 455 + static inline unsigned int __io_get_work_hash(unsigned int work_flags) 456 + { 457 + return work_flags >> IO_WQ_HASH_SHIFT; 458 458 } 459 459 460 460 static inline unsigned int io_get_work_hash(struct io_wq_work *work) 461 461 { 462 - return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; 462 + return __io_get_work_hash(atomic_read(&work->flags)); 463 463 } 464 464 465 465 static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) ··· 485 475 } 486 476 487 477 static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, 488 - struct io_worker *worker) 478 + struct io_wq *wq) 489 479 __must_hold(acct->lock) 490 480 { 491 481 struct io_wq_work_node *node, *prev; 492 482 struct io_wq_work *work, *tail; 493 483 unsigned int stall_hash = -1U; 494 - struct io_wq *wq = 
worker->wq; 495 484 496 485 wq_list_for_each(node, prev, &acct->work_list) { 486 + unsigned int work_flags; 497 487 unsigned int hash; 498 488 499 489 work = container_of(node, struct io_wq_work, list); 500 490 501 491 /* not hashed, can run anytime */ 502 - if (!io_wq_is_hashed(work)) { 492 + work_flags = atomic_read(&work->flags); 493 + if (!__io_wq_is_hashed(work_flags)) { 503 494 wq_list_del(&acct->work_list, node, prev); 504 495 return work; 505 496 } 506 497 507 - hash = io_get_work_hash(work); 498 + hash = __io_get_work_hash(work_flags); 508 499 /* all items with this hash lie in [work, tail] */ 509 500 tail = wq->hash_tail[hash]; 510 501 ··· 575 564 * can't make progress, any work completion or insertion will 576 565 * clear the stalled flag. 577 566 */ 578 - work = io_get_next_work(acct, worker); 567 + work = io_get_next_work(acct, wq); 579 568 if (work) { 580 569 /* 581 570 * Make sure cancelation can find this, even before ··· 594 583 if (!work) 595 584 break; 596 585 597 - __io_worker_busy(wq, worker); 586 + __io_worker_busy(acct, worker); 598 587 599 588 io_assign_current_work(worker, work); 600 589 __set_current_state(TASK_RUNNING); ··· 602 591 /* handle a whole dependent link */ 603 592 do { 604 593 struct io_wq_work *next_hashed, *linked; 605 - unsigned int hash = io_get_work_hash(work); 594 + unsigned int work_flags = atomic_read(&work->flags); 595 + unsigned int hash = __io_wq_is_hashed(work_flags) 596 + ? 
__io_get_work_hash(work_flags) 597 + : -1U; 606 598 607 599 next_hashed = wq_next_work(work); 608 600 609 601 if (do_kill && 610 - (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) 602 + (work_flags & IO_WQ_WORK_UNBOUND)) 611 603 atomic_or(IO_WQ_WORK_CANCEL, &work->flags); 612 604 wq->do_work(work); 613 605 io_assign_current_work(worker, NULL); ··· 668 654 while (io_acct_run_queue(acct)) 669 655 io_worker_handle_work(acct, worker); 670 656 671 - raw_spin_lock(&wq->lock); 657 + raw_spin_lock(&acct->workers_lock); 672 658 /* 673 659 * Last sleep timed out. Exit if we're not the last worker, 674 660 * or if someone modified our affinity. 675 661 */ 676 662 if (last_timeout && (exit_mask || acct->nr_workers > 1)) { 677 663 acct->nr_workers--; 678 - raw_spin_unlock(&wq->lock); 664 + raw_spin_unlock(&acct->workers_lock); 679 665 __set_current_state(TASK_RUNNING); 680 666 break; 681 667 } 682 668 last_timeout = false; 683 - __io_worker_idle(wq, worker); 684 - raw_spin_unlock(&wq->lock); 669 + __io_worker_idle(acct, worker); 670 + raw_spin_unlock(&acct->workers_lock); 685 671 if (io_run_task_work()) 686 672 continue; 687 673 ret = schedule_timeout(WORKER_IDLE_TIMEOUT); ··· 742 728 io_wq_dec_running(worker); 743 729 } 744 730 745 - static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, 731 + static void io_init_new_worker(struct io_wq *wq, struct io_wq_acct *acct, struct io_worker *worker, 746 732 struct task_struct *tsk) 747 733 { 748 734 tsk->worker_private = worker; 749 735 worker->task = tsk; 750 736 set_cpus_allowed_ptr(tsk, wq->cpu_mask); 751 737 752 - raw_spin_lock(&wq->lock); 753 - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); 754 - list_add_tail_rcu(&worker->all_list, &wq->all_list); 738 + raw_spin_lock(&acct->workers_lock); 739 + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); 740 + list_add_tail_rcu(&worker->all_list, &acct->all_list); 755 741 set_bit(IO_WORKER_F_FREE, &worker->flags); 756 - 
raw_spin_unlock(&wq->lock); 742 + raw_spin_unlock(&acct->workers_lock); 757 743 wake_up_new_task(tsk); 758 744 } 759 745 ··· 801 787 struct io_worker *worker; 802 788 struct task_struct *tsk; 803 789 struct io_wq *wq; 790 + struct io_wq_acct *acct; 804 791 805 792 worker = container_of(cb, struct io_worker, create_work); 806 793 clear_bit_unlock(0, &worker->create_state); 807 794 wq = worker->wq; 795 + acct = io_wq_get_acct(worker); 808 796 tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); 809 797 if (!IS_ERR(tsk)) { 810 - io_init_new_worker(wq, worker, tsk); 798 + io_init_new_worker(wq, acct, worker, tsk); 811 799 io_worker_release(worker); 812 800 return; 813 801 } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { 814 - struct io_wq_acct *acct = io_wq_get_acct(worker); 815 - 816 802 atomic_dec(&acct->nr_running); 817 - raw_spin_lock(&wq->lock); 803 + raw_spin_lock(&acct->workers_lock); 818 804 acct->nr_workers--; 819 805 if (!acct->nr_workers) { 820 806 struct io_cb_cancel_data match = { ··· 822 808 .cancel_all = true, 823 809 }; 824 810 825 - raw_spin_unlock(&wq->lock); 811 + raw_spin_unlock(&acct->workers_lock); 826 812 while (io_acct_cancel_pending_work(wq, acct, &match)) 827 813 ; 828 814 } else { 829 - raw_spin_unlock(&wq->lock); 815 + raw_spin_unlock(&acct->workers_lock); 830 816 } 831 817 io_worker_ref_put(wq); 832 818 kfree(worker); ··· 848 834 kfree(worker); 849 835 } 850 836 851 - static bool create_io_worker(struct io_wq *wq, int index) 837 + static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct) 852 838 { 853 - struct io_wq_acct *acct = &wq->acct[index]; 854 839 struct io_worker *worker; 855 840 struct task_struct *tsk; 856 841 ··· 859 846 if (!worker) { 860 847 fail: 861 848 atomic_dec(&acct->nr_running); 862 - raw_spin_lock(&wq->lock); 849 + raw_spin_lock(&acct->workers_lock); 863 850 acct->nr_workers--; 864 - raw_spin_unlock(&wq->lock); 851 + raw_spin_unlock(&acct->workers_lock); 865 852 io_worker_ref_put(wq); 866 
853 return false; 867 854 } 868 855 869 856 refcount_set(&worker->ref, 1); 870 857 worker->wq = wq; 858 + worker->acct = acct; 871 859 raw_spin_lock_init(&worker->lock); 872 860 init_completion(&worker->ref_done); 873 861 874 - if (index == IO_WQ_ACCT_BOUND) 875 - set_bit(IO_WORKER_F_BOUND, &worker->flags); 876 - 877 862 tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); 878 863 if (!IS_ERR(tsk)) { 879 - io_init_new_worker(wq, worker, tsk); 864 + io_init_new_worker(wq, acct, worker, tsk); 880 865 } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { 881 866 kfree(worker); 882 867 goto fail; ··· 890 879 * Iterate the passed in list and call the specific function for each 891 880 * worker that isn't exiting 892 881 */ 893 - static bool io_wq_for_each_worker(struct io_wq *wq, 894 - bool (*func)(struct io_worker *, void *), 895 - void *data) 882 + static bool io_acct_for_each_worker(struct io_wq_acct *acct, 883 + bool (*func)(struct io_worker *, void *), 884 + void *data) 896 885 { 897 886 struct io_worker *worker; 898 887 bool ret = false; 899 888 900 - list_for_each_entry_rcu(worker, &wq->all_list, all_list) { 889 + list_for_each_entry_rcu(worker, &acct->all_list, all_list) { 901 890 if (io_worker_get(worker)) { 902 891 /* no task if node is/was offline */ 903 892 if (worker->task) ··· 909 898 } 910 899 911 900 return ret; 901 + } 902 + 903 + static bool io_wq_for_each_worker(struct io_wq *wq, 904 + bool (*func)(struct io_worker *, void *), 905 + void *data) 906 + { 907 + for (int i = 0; i < IO_WQ_ACCT_NR; i++) { 908 + if (!io_acct_for_each_worker(&wq->acct[i], func, data)) 909 + return false; 910 + } 911 + 912 + return true; 912 913 } 913 914 914 915 static bool io_wq_worker_wake(struct io_worker *worker, void *data) ··· 939 916 } while (work); 940 917 } 941 918 942 - static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work) 919 + static void io_wq_insert_work(struct io_wq *wq, struct io_wq_acct *acct, 920 + struct io_wq_work *work, 
unsigned int work_flags) 943 921 { 944 - struct io_wq_acct *acct = io_work_get_acct(wq, work); 945 922 unsigned int hash; 946 923 struct io_wq_work *tail; 947 924 948 - if (!io_wq_is_hashed(work)) { 925 + if (!__io_wq_is_hashed(work_flags)) { 949 926 append: 950 927 wq_list_add_tail(&work->list, &acct->work_list); 951 928 return; 952 929 } 953 930 954 - hash = io_get_work_hash(work); 931 + hash = __io_get_work_hash(work_flags); 955 932 tail = wq->hash_tail[hash]; 956 933 wq->hash_tail[hash] = work; 957 934 if (!tail) ··· 967 944 968 945 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) 969 946 { 970 - struct io_wq_acct *acct = io_work_get_acct(wq, work); 971 947 unsigned int work_flags = atomic_read(&work->flags); 948 + struct io_wq_acct *acct = io_work_get_acct(wq, work_flags); 972 949 struct io_cb_cancel_data match = { 973 950 .fn = io_wq_work_match_item, 974 951 .data = work, ··· 987 964 } 988 965 989 966 raw_spin_lock(&acct->lock); 990 - io_wq_insert_work(wq, work); 967 + io_wq_insert_work(wq, acct, work, work_flags); 991 968 clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); 992 969 raw_spin_unlock(&acct->lock); 993 970 994 971 rcu_read_lock(); 995 - do_create = !io_wq_activate_free_worker(wq, acct); 972 + do_create = !io_acct_activate_free_worker(acct); 996 973 rcu_read_unlock(); 997 974 998 975 if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || ··· 1003 980 if (likely(did_create)) 1004 981 return; 1005 982 1006 - raw_spin_lock(&wq->lock); 983 + raw_spin_lock(&acct->workers_lock); 1007 984 if (acct->nr_workers) { 1008 - raw_spin_unlock(&wq->lock); 985 + raw_spin_unlock(&acct->workers_lock); 1009 986 return; 1010 987 } 1011 - raw_spin_unlock(&wq->lock); 988 + raw_spin_unlock(&acct->workers_lock); 1012 989 1013 990 /* fatal condition, failed to create the first worker */ 1014 991 io_acct_cancel_pending_work(wq, acct, &match); ··· 1057 1034 } 1058 1035 1059 1036 static inline void io_wq_remove_pending(struct io_wq *wq, 1037 + struct io_wq_acct 
*acct, 1060 1038 struct io_wq_work *work, 1061 1039 struct io_wq_work_node *prev) 1062 1040 { 1063 - struct io_wq_acct *acct = io_work_get_acct(wq, work); 1064 1041 unsigned int hash = io_get_work_hash(work); 1065 1042 struct io_wq_work *prev_work = NULL; 1066 1043 ··· 1087 1064 work = container_of(node, struct io_wq_work, list); 1088 1065 if (!match->fn(work, match->data)) 1089 1066 continue; 1090 - io_wq_remove_pending(wq, work, prev); 1067 + io_wq_remove_pending(wq, acct, work, prev); 1091 1068 raw_spin_unlock(&acct->lock); 1092 1069 io_run_cancel(work, wq); 1093 1070 match->nr_pending++; ··· 1115 1092 } 1116 1093 } 1117 1094 1095 + static void io_acct_cancel_running_work(struct io_wq_acct *acct, 1096 + struct io_cb_cancel_data *match) 1097 + { 1098 + raw_spin_lock(&acct->workers_lock); 1099 + io_acct_for_each_worker(acct, io_wq_worker_cancel, match); 1100 + raw_spin_unlock(&acct->workers_lock); 1101 + } 1102 + 1118 1103 static void io_wq_cancel_running_work(struct io_wq *wq, 1119 1104 struct io_cb_cancel_data *match) 1120 1105 { 1121 1106 rcu_read_lock(); 1122 - io_wq_for_each_worker(wq, io_wq_worker_cancel, match); 1107 + 1108 + for (int i = 0; i < IO_WQ_ACCT_NR; i++) 1109 + io_acct_cancel_running_work(&wq->acct[i], match); 1110 + 1123 1111 rcu_read_unlock(); 1124 1112 } 1125 1113 ··· 1153 1119 * as an indication that we attempt to signal cancellation. The 1154 1120 * completion will run normally in this case. 1155 1121 * 1156 - * Do both of these while holding the wq->lock, to ensure that 1122 + * Do both of these while holding the acct->workers_lock, to ensure that 1157 1123 * we'll find a work item regardless of state. 
1158 1124 */ 1159 1125 io_wq_cancel_pending_work(wq, &match); 1160 1126 if (match.nr_pending && !match.cancel_all) 1161 1127 return IO_WQ_CANCEL_OK; 1162 1128 1163 - raw_spin_lock(&wq->lock); 1164 1129 io_wq_cancel_running_work(wq, &match); 1165 - raw_spin_unlock(&wq->lock); 1166 1130 if (match.nr_running && !match.cancel_all) 1167 1131 return IO_WQ_CANCEL_RUNNING; 1168 1132 ··· 1184 1152 struct io_wq_acct *acct = &wq->acct[i]; 1185 1153 1186 1154 if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) 1187 - io_wq_activate_free_worker(wq, acct); 1155 + io_acct_activate_free_worker(acct); 1188 1156 } 1189 1157 rcu_read_unlock(); 1190 1158 return 1; ··· 1222 1190 for (i = 0; i < IO_WQ_ACCT_NR; i++) { 1223 1191 struct io_wq_acct *acct = &wq->acct[i]; 1224 1192 1225 - acct->index = i; 1226 1193 atomic_set(&acct->nr_running, 0); 1194 + 1195 + raw_spin_lock_init(&acct->workers_lock); 1196 + INIT_HLIST_NULLS_HEAD(&acct->free_list, 0); 1197 + INIT_LIST_HEAD(&acct->all_list); 1198 + 1227 1199 INIT_WQ_LIST(&acct->work_list); 1228 1200 raw_spin_lock_init(&acct->lock); 1229 1201 } 1230 - 1231 - raw_spin_lock_init(&wq->lock); 1232 - INIT_HLIST_NULLS_HEAD(&wq->free_list, 0); 1233 - INIT_LIST_HEAD(&wq->all_list); 1234 1202 1235 1203 wq->task = get_task_struct(data->task); 1236 1204 atomic_set(&wq->worker_refs, 1); ··· 1417 1385 1418 1386 rcu_read_lock(); 1419 1387 1420 - raw_spin_lock(&wq->lock); 1421 1388 for (i = 0; i < IO_WQ_ACCT_NR; i++) { 1422 1389 acct = &wq->acct[i]; 1390 + raw_spin_lock(&acct->workers_lock); 1423 1391 prev[i] = max_t(int, acct->max_workers, prev[i]); 1424 1392 if (new_count[i]) 1425 1393 acct->max_workers = new_count[i]; 1394 + raw_spin_unlock(&acct->workers_lock); 1426 1395 } 1427 - raw_spin_unlock(&wq->lock); 1428 1396 rcu_read_unlock(); 1429 1397 1430 1398 for (i = 0; i < IO_WQ_ACCT_NR; i++)
+6 -1
io_uring/io-wq.h
··· 54 54 int io_wq_max_workers(struct io_wq *wq, int *new_count); 55 55 bool io_wq_worker_stopped(void); 56 56 57 + static inline bool __io_wq_is_hashed(unsigned int work_flags) 58 + { 59 + return work_flags & IO_WQ_WORK_HASHED; 60 + } 61 + 57 62 static inline bool io_wq_is_hashed(struct io_wq_work *work) 58 63 { 59 - return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; 64 + return __io_wq_is_hashed(atomic_read(&work->flags)); 60 65 } 61 66 62 67 typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
+145 -104
io_uring/io_uring.c
··· 110 110 #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ 111 111 IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) 112 112 113 + #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) 114 + 113 115 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ 114 116 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ 115 117 REQ_F_ASYNC_DATA) 116 118 117 - #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ 119 + #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ 118 120 REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) 119 121 120 122 #define IO_TCTX_REFS_CACHE_NR (1U << 10) ··· 133 131 134 132 /* requests with any of those set should undergo io_disarm_next() */ 135 133 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 136 - #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) 137 134 138 135 /* 139 136 * No waiters. It's larger than any valid value of the tw counter ··· 255 254 percpu_ref_get(&ctx->refs); 256 255 mutex_lock(&ctx->uring_lock); 257 256 llist_for_each_entry_safe(req, tmp, node, io_task_work.node) 258 - req->io_task_work.func(req, &ts); 257 + req->io_task_work.func(req, ts); 259 258 io_submit_flush_completions(ctx); 260 259 mutex_unlock(&ctx->uring_lock); 261 260 percpu_ref_put(&ctx->refs); ··· 281 280 for (i = 0; i < hash_buckets; i++) 282 281 INIT_HLIST_HEAD(&table->hbs[i].list); 283 282 return 0; 283 + } 284 + 285 + static void io_free_alloc_caches(struct io_ring_ctx *ctx) 286 + { 287 + io_alloc_cache_free(&ctx->apoll_cache, kfree); 288 + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 289 + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 290 + io_alloc_cache_free(&ctx->uring_cache, kfree); 291 + io_alloc_cache_free(&ctx->msg_cache, kfree); 292 + io_futex_cache_free(ctx); 293 + io_rsrc_cache_free(ctx); 284 294 } 285 295 286 296 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ··· 325 313 
init_waitqueue_head(&ctx->sqo_sq_wait); 326 314 INIT_LIST_HEAD(&ctx->sqd_list); 327 315 INIT_LIST_HEAD(&ctx->cq_overflow_list); 328 - INIT_LIST_HEAD(&ctx->io_buffers_cache); 329 316 ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, 330 317 sizeof(struct async_poll), 0); 331 318 ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, ··· 339 328 ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, 340 329 sizeof(struct io_kiocb), 0); 341 330 ret |= io_futex_cache_init(ctx); 331 + ret |= io_rsrc_cache_init(ctx); 342 332 if (ret) 343 333 goto free_ref; 344 334 init_completion(&ctx->ref_comp); ··· 350 338 spin_lock_init(&ctx->completion_lock); 351 339 raw_spin_lock_init(&ctx->timeout_lock); 352 340 INIT_WQ_LIST(&ctx->iopoll_list); 353 - INIT_LIST_HEAD(&ctx->io_buffers_comp); 354 341 INIT_LIST_HEAD(&ctx->defer_list); 355 342 INIT_LIST_HEAD(&ctx->timeout_list); 356 343 INIT_LIST_HEAD(&ctx->ltimeout_list); ··· 371 360 free_ref: 372 361 percpu_ref_exit(&ctx->refs); 373 362 err: 374 - io_alloc_cache_free(&ctx->apoll_cache, kfree); 375 - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 376 - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 377 - io_alloc_cache_free(&ctx->uring_cache, kfree); 378 - io_alloc_cache_free(&ctx->msg_cache, kfree); 379 - io_futex_cache_free(ctx); 363 + io_free_alloc_caches(ctx); 380 364 kvfree(ctx->cancel_table.hbs); 381 365 xa_destroy(&ctx->io_bl_xa); 382 366 kfree(ctx); ··· 399 393 400 394 static void io_clean_op(struct io_kiocb *req) 401 395 { 402 - if (req->flags & REQ_F_BUFFER_SELECTED) { 403 - spin_lock(&req->ctx->completion_lock); 404 - io_kbuf_drop(req); 405 - spin_unlock(&req->ctx->completion_lock); 406 - } 396 + if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) 397 + io_kbuf_drop_legacy(req); 407 398 408 399 if (req->flags & REQ_F_NEED_CLEANUP) { 409 400 const struct io_cold_def *def = &io_cold_defs[req->opcode]; ··· 545 542 io_queue_linked_timeout(link); 546 543 } 547 544 548 
- static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts) 545 + static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) 549 546 { 550 547 io_queue_iowq(req); 551 548 } ··· 902 899 * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires 903 900 * the submitter task context, IOPOLL protects with uring_lock. 904 901 */ 905 - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { 902 + if (ctx->lockless_cq) { 906 903 req->io_task_work.func = io_req_task_complete; 907 904 io_req_task_work_add(req); 908 905 return; ··· 1024 1021 return nxt; 1025 1022 } 1026 1023 1027 - static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) 1024 + static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) 1028 1025 { 1029 1026 if (!ctx) 1030 1027 return; ··· 1054 1051 io_task_work.node); 1055 1052 1056 1053 if (req->ctx != ctx) { 1057 - ctx_flush_and_put(ctx, &ts); 1054 + ctx_flush_and_put(ctx, ts); 1058 1055 ctx = req->ctx; 1059 1056 mutex_lock(&ctx->uring_lock); 1060 1057 percpu_ref_get(&ctx->refs); 1061 1058 } 1062 1059 INDIRECT_CALL_2(req->io_task_work.func, 1063 1060 io_poll_task_func, io_req_rw_complete, 1064 - req, &ts); 1061 + req, ts); 1065 1062 node = next; 1066 1063 (*count)++; 1067 1064 if (unlikely(need_resched())) { 1068 - ctx_flush_and_put(ctx, &ts); 1065 + ctx_flush_and_put(ctx, ts); 1069 1066 ctx = NULL; 1070 1067 cond_resched(); 1071 1068 } 1072 1069 } while (node && *count < max_entries); 1073 1070 1074 - ctx_flush_and_put(ctx, &ts); 1071 + ctx_flush_and_put(ctx, ts); 1075 1072 return node; 1076 1073 } 1077 1074 ··· 1160 1157 * We don't know how many reuqests is there in the link and whether 1161 1158 * they can even be queued lazily, fall back to non-lazy. 
1162 1159 */ 1163 - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) 1160 + if (req->flags & IO_REQ_LINK_FLAGS) 1164 1161 flags &= ~IOU_F_TWQ_LAZY_WAKE; 1165 1162 1166 1163 guard(rcu)(); ··· 1279 1276 } 1280 1277 1281 1278 static int __io_run_local_work_loop(struct llist_node **node, 1282 - struct io_tw_state *ts, 1279 + io_tw_token_t tw, 1283 1280 int events) 1284 1281 { 1285 1282 int ret = 0; ··· 1290 1287 io_task_work.node); 1291 1288 INDIRECT_CALL_2(req->io_task_work.func, 1292 1289 io_poll_task_func, io_req_rw_complete, 1293 - req, ts); 1290 + req, tw); 1294 1291 *node = next; 1295 1292 if (++ret >= events) 1296 1293 break; ··· 1299 1296 return ret; 1300 1297 } 1301 1298 1302 - static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, 1299 + static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, 1303 1300 int min_events, int max_events) 1304 1301 { 1305 1302 struct llist_node *node; ··· 1312 1309 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1313 1310 again: 1314 1311 min_events -= ret; 1315 - ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events); 1312 + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); 1316 1313 if (ctx->retry_llist.first) 1317 1314 goto retry_done; 1318 1315 ··· 1321 1318 * running the pending items. 
1322 1319 */ 1323 1320 node = llist_reverse_order(llist_del_all(&ctx->work_llist)); 1324 - ret += __io_run_local_work_loop(&node, ts, max_events - ret); 1321 + ret += __io_run_local_work_loop(&node, tw, max_events - ret); 1325 1322 ctx->retry_llist.first = node; 1326 1323 loops++; 1327 1324 ··· 1343 1340 1344 1341 if (!io_local_work_pending(ctx)) 1345 1342 return 0; 1346 - return __io_run_local_work(ctx, &ts, min_events, 1343 + return __io_run_local_work(ctx, ts, min_events, 1347 1344 max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 1348 1345 } 1349 1346 ··· 1354 1351 int ret; 1355 1352 1356 1353 mutex_lock(&ctx->uring_lock); 1357 - ret = __io_run_local_work(ctx, &ts, min_events, max_events); 1354 + ret = __io_run_local_work(ctx, ts, min_events, max_events); 1358 1355 mutex_unlock(&ctx->uring_lock); 1359 1356 return ret; 1360 1357 } 1361 1358 1362 - static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) 1359 + static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) 1363 1360 { 1364 - io_tw_lock(req->ctx, ts); 1361 + io_tw_lock(req->ctx, tw); 1365 1362 io_req_defer_failed(req, req->cqe.res); 1366 1363 } 1367 1364 1368 - void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) 1365 + void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) 1369 1366 { 1370 - io_tw_lock(req->ctx, ts); 1367 + io_tw_lock(req->ctx, tw); 1371 1368 if (unlikely(io_should_terminate_tw())) 1372 1369 io_req_defer_failed(req, -EFAULT); 1373 1370 else if (req->flags & REQ_F_FORCE_ASYNC) ··· 1422 1419 1423 1420 if (apoll->double_poll) 1424 1421 kfree(apoll->double_poll); 1425 - if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) 1426 - kfree(apoll); 1422 + io_cache_free(&ctx->apoll_cache, apoll); 1427 1423 req->flags &= ~REQ_F_POLLED; 1428 1424 } 1429 1425 if (req->flags & IO_REQ_LINK_FLAGS) ··· 1584 1582 return 0; 1585 1583 } 1586 1584 1587 - void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) 1585 + void 
io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw) 1588 1586 { 1589 1587 io_req_complete_defer(req); 1590 1588 } ··· 1721 1719 return !!req->file; 1722 1720 } 1723 1721 1724 - static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 1722 + static inline int __io_issue_sqe(struct io_kiocb *req, 1723 + unsigned int issue_flags, 1724 + const struct io_issue_def *def) 1725 1725 { 1726 - const struct io_issue_def *def = &io_issue_defs[req->opcode]; 1727 1726 const struct cred *creds = NULL; 1728 1727 int ret; 1729 - 1730 - if (unlikely(!io_assign_file(req, def, issue_flags))) 1731 - return -EBADF; 1732 1728 1733 1729 if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) 1734 1730 creds = override_creds(req->creds); ··· 1741 1741 1742 1742 if (creds) 1743 1743 revert_creds(creds); 1744 + 1745 + return ret; 1746 + } 1747 + 1748 + static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 1749 + { 1750 + const struct io_issue_def *def = &io_issue_defs[req->opcode]; 1751 + int ret; 1752 + 1753 + if (unlikely(!io_assign_file(req, def, issue_flags))) 1754 + return -EBADF; 1755 + 1756 + ret = __io_issue_sqe(req, issue_flags, def); 1744 1757 1745 1758 if (ret == IOU_OK) { 1746 1759 if (issue_flags & IO_URING_F_COMPLETE_DEFER) ··· 1775 1762 return ret; 1776 1763 } 1777 1764 1778 - int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) 1765 + int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) 1779 1766 { 1780 - io_tw_lock(req->ctx, ts); 1781 - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| 1782 - IO_URING_F_COMPLETE_DEFER); 1767 + const unsigned int issue_flags = IO_URING_F_NONBLOCK | 1768 + IO_URING_F_MULTISHOT | 1769 + IO_URING_F_COMPLETE_DEFER; 1770 + int ret; 1771 + 1772 + io_tw_lock(req->ctx, tw); 1773 + 1774 + WARN_ON_ONCE(!req->file); 1775 + if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) 1776 + return -EFAULT; 1777 + 1778 + ret = __io_issue_sqe(req, issue_flags, 
&io_issue_defs[req->opcode]); 1779 + 1780 + WARN_ON_ONCE(ret == IOU_OK); 1781 + 1782 + if (ret == IOU_ISSUE_SKIP_COMPLETE) 1783 + ret = 0; 1784 + return ret; 1783 1785 } 1784 1786 1785 1787 struct io_wq_work *io_wq_free_work(struct io_wq_work *work) ··· 2024 1996 return true; 2025 1997 } 2026 1998 2027 - static void io_init_req_drain(struct io_kiocb *req) 1999 + static void io_init_drain(struct io_ring_ctx *ctx) 2028 2000 { 2029 - struct io_ring_ctx *ctx = req->ctx; 2030 2001 struct io_kiocb *head = ctx->submit_state.link.head; 2031 2002 2032 2003 ctx->drain_active = true; ··· 2089 2062 if (sqe_flags & IOSQE_IO_DRAIN) { 2090 2063 if (ctx->drain_disabled) 2091 2064 return io_init_fail_req(req, -EOPNOTSUPP); 2092 - io_init_req_drain(req); 2065 + io_init_drain(ctx); 2093 2066 } 2094 2067 } 2095 2068 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { ··· 2485 2458 return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; 2486 2459 } 2487 2460 2461 + struct ext_arg { 2462 + size_t argsz; 2463 + struct timespec64 ts; 2464 + const sigset_t __user *sig; 2465 + ktime_t min_time; 2466 + bool ts_set; 2467 + bool iowait; 2468 + }; 2469 + 2488 2470 static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, 2489 2471 struct io_wait_queue *iowq, 2472 + struct ext_arg *ext_arg, 2490 2473 ktime_t start_time) 2491 2474 { 2492 2475 int ret = 0; ··· 2506 2469 * can take into account that the task is waiting for IO - turns out 2507 2470 * to be important for low QD IO. 
2508 2471 */ 2509 - if (current_pending_io()) 2472 + if (ext_arg->iowait && current_pending_io()) 2510 2473 current->in_iowait = 1; 2511 2474 if (iowq->timeout != KTIME_MAX || iowq->min_timeout) 2512 2475 ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); ··· 2519 2482 /* If this returns > 0, the caller should retry */ 2520 2483 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 2521 2484 struct io_wait_queue *iowq, 2485 + struct ext_arg *ext_arg, 2522 2486 ktime_t start_time) 2523 2487 { 2524 2488 if (unlikely(READ_ONCE(ctx->check_cq))) ··· 2533 2495 if (unlikely(io_should_wake(iowq))) 2534 2496 return 0; 2535 2497 2536 - return __io_cqring_wait_schedule(ctx, iowq, start_time); 2498 + return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); 2537 2499 } 2538 - 2539 - struct ext_arg { 2540 - size_t argsz; 2541 - struct timespec64 ts; 2542 - const sigset_t __user *sig; 2543 - ktime_t min_time; 2544 - bool ts_set; 2545 - }; 2546 2500 2547 2501 /* 2548 2502 * Wait until events become available, if we don't already have some. 
The ··· 2613 2583 TASK_INTERRUPTIBLE); 2614 2584 } 2615 2585 2616 - ret = io_cqring_wait_schedule(ctx, &iowq, start_time); 2586 + ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); 2617 2587 __set_current_state(TASK_RUNNING); 2618 2588 atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); 2619 2589 ··· 2734 2704 io_sqe_files_unregister(ctx); 2735 2705 io_cqring_overflow_kill(ctx); 2736 2706 io_eventfd_unregister(ctx); 2737 - io_alloc_cache_free(&ctx->apoll_cache, kfree); 2738 - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 2739 - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 2740 - io_alloc_cache_free(&ctx->uring_cache, kfree); 2741 - io_alloc_cache_free(&ctx->msg_cache, kfree); 2742 - io_futex_cache_free(ctx); 2707 + io_free_alloc_caches(ctx); 2743 2708 io_destroy_buffers(ctx); 2744 2709 io_free_region(ctx, &ctx->param_region); 2745 2710 mutex_unlock(&ctx->uring_lock); ··· 3264 3239 const struct io_uring_getevents_arg __user *uarg = argp; 3265 3240 struct io_uring_getevents_arg arg; 3266 3241 3242 + ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT); 3243 + 3267 3244 /* 3268 3245 * If EXT_ARG isn't set, then we have no timespec and the argp pointer 3269 3246 * is just a pointer to the sigset_t. 
··· 3343 3316 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | 3344 3317 IORING_ENTER_REGISTERED_RING | 3345 3318 IORING_ENTER_ABS_TIMER | 3346 - IORING_ENTER_EXT_ARG_REG))) 3319 + IORING_ENTER_EXT_ARG_REG | 3320 + IORING_ENTER_NO_IOWAIT))) 3347 3321 return -EINVAL; 3348 3322 3349 3323 /* ··· 3565 3537 O_RDWR | O_CLOEXEC, NULL); 3566 3538 } 3567 3539 3540 + static int io_uring_sanitise_params(struct io_uring_params *p) 3541 + { 3542 + unsigned flags = p->flags; 3543 + 3544 + /* There is no way to mmap rings without a real fd */ 3545 + if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && 3546 + !(flags & IORING_SETUP_NO_MMAP)) 3547 + return -EINVAL; 3548 + 3549 + if (flags & IORING_SETUP_SQPOLL) { 3550 + /* IPI related flags don't make sense with SQPOLL */ 3551 + if (flags & (IORING_SETUP_COOP_TASKRUN | 3552 + IORING_SETUP_TASKRUN_FLAG | 3553 + IORING_SETUP_DEFER_TASKRUN)) 3554 + return -EINVAL; 3555 + } 3556 + 3557 + if (flags & IORING_SETUP_TASKRUN_FLAG) { 3558 + if (!(flags & (IORING_SETUP_COOP_TASKRUN | 3559 + IORING_SETUP_DEFER_TASKRUN))) 3560 + return -EINVAL; 3561 + } 3562 + 3563 + /* HYBRID_IOPOLL only valid with IOPOLL */ 3564 + if ((flags & IORING_SETUP_HYBRID_IOPOLL) && !(flags & IORING_SETUP_IOPOLL)) 3565 + return -EINVAL; 3566 + 3567 + /* 3568 + * For DEFER_TASKRUN we require the completion task to be the same as 3569 + * the submission task. This implies that there is only one submitter. 3570 + */ 3571 + if ((flags & IORING_SETUP_DEFER_TASKRUN) && 3572 + !(flags & IORING_SETUP_SINGLE_ISSUER)) 3573 + return -EINVAL; 3574 + 3575 + return 0; 3576 + } 3577 + 3568 3578 int io_uring_fill_params(unsigned entries, struct io_uring_params *p) 3569 3579 { 3570 3580 if (!entries) ··· 3612 3546 return -EINVAL; 3613 3547 entries = IORING_MAX_ENTRIES; 3614 3548 } 3615 - 3616 - if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) 3617 - && !(p->flags & IORING_SETUP_NO_MMAP)) 3618 - return -EINVAL; 3619 3549 3620 3550 /* 3621 3551 * Use twice as many entries for the CQ ring. 
It's possible for the ··· 3674 3612 struct file *file; 3675 3613 int ret; 3676 3614 3615 + ret = io_uring_sanitise_params(p); 3616 + if (ret) 3617 + return ret; 3618 + 3677 3619 ret = io_uring_fill_params(entries, p); 3678 3620 if (unlikely(ret)) 3679 3621 return ret; ··· 3725 3659 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if 3726 3660 * COOP_TASKRUN is set, then IPIs are never needed by the app. 3727 3661 */ 3728 - ret = -EINVAL; 3729 - if (ctx->flags & IORING_SETUP_SQPOLL) { 3730 - /* IPI related flags don't make sense with SQPOLL */ 3731 - if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | 3732 - IORING_SETUP_TASKRUN_FLAG | 3733 - IORING_SETUP_DEFER_TASKRUN)) 3734 - goto err; 3662 + if (ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_COOP_TASKRUN)) 3735 3663 ctx->notify_method = TWA_SIGNAL_NO_IPI; 3736 - } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { 3737 - ctx->notify_method = TWA_SIGNAL_NO_IPI; 3738 - } else { 3739 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && 3740 - !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 3741 - goto err; 3664 + else 3742 3665 ctx->notify_method = TWA_SIGNAL; 3743 - } 3744 - 3745 - /* HYBRID_IOPOLL only valid with IOPOLL */ 3746 - if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) == 3747 - IORING_SETUP_HYBRID_IOPOLL) 3748 - goto err; 3749 - 3750 - /* 3751 - * For DEFER_TASKRUN we require the completion task to be the same as the 3752 - * submission task. This implies that there is only one submitter, so enforce 3753 - * that. 3754 - */ 3755 - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && 3756 - !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { 3757 - goto err; 3758 - } 3759 3666 3760 3667 /* 3761 3668 * This is just grabbed for accounting purposes. 
When a process exits, ··· 3758 3719 IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | 3759 3720 IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | 3760 3721 IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | 3761 - IORING_FEAT_RW_ATTR; 3722 + IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT; 3762 3723 3763 3724 if (copy_to_user(params, p, sizeof(*p))) { 3764 3725 ret = -EFAULT; ··· 3954 3915 3955 3916 io_uring_optable_init(); 3956 3917 3918 + /* imu->dir is u8 */ 3919 + BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); 3920 + 3957 3921 /* 3958 3922 * Allow user copy in the per-command field, which starts after the 3959 3923 * file in io_kiocb and until the opcode field. The openat2 handling ··· 3967 3925 req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, 3968 3926 SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | 3969 3927 SLAB_TYPESAFE_BY_RCU); 3970 - io_buf_cachep = KMEM_CACHE(io_buffer, 3971 - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); 3972 3928 3973 3929 iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); 3930 + BUG_ON(!iou_wq); 3974 3931 3975 3932 #ifdef CONFIG_SYSCTL 3976 3933 register_sysctl_init("kernel", kernel_io_uring_disabled_table);
+9 -6
io_uring/io_uring.h
··· 88 88 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); 89 89 void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, 90 90 unsigned flags); 91 - bool io_alloc_async_data(struct io_kiocb *req); 92 91 void io_req_task_queue(struct io_kiocb *req); 93 - void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); 92 + void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); 94 93 void io_req_task_queue_fail(struct io_kiocb *req, int ret); 95 - void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); 94 + void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw); 96 95 struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); 97 96 struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); 98 97 void tctx_task_work(struct callback_head *cb); ··· 103 104 int start, int end); 104 105 void io_req_queue_iowq(struct io_kiocb *req); 105 106 106 - int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); 107 + int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw); 107 108 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); 108 109 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); 109 110 void __io_submit_flush_completions(struct io_ring_ctx *ctx); ··· 144 145 lockdep_assert(current == ctx->submitter_task); 145 146 } 146 147 #endif 148 + } 149 + 150 + static inline bool io_is_compat(struct io_ring_ctx *ctx) 151 + { 152 + return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); 147 153 } 148 154 149 155 static inline void io_req_task_work_add(struct io_kiocb *req) ··· 380 376 return task_work_pending(current) || io_local_work_pending(ctx); 381 377 } 382 378 383 - static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) 379 + static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) 384 380 { 385 381 
lockdep_assert_held(&ctx->uring_lock); 386 382 } ··· 422 418 } 423 419 424 420 extern struct kmem_cache *req_cachep; 425 - extern struct kmem_cache *io_buf_cachep; 426 421 427 422 static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) 428 423 {
+88 -112
io_uring/kbuf.c
··· 20 20 /* BIDs are addressed by a 16-bit field in a CQE */ 21 21 #define MAX_BIDS_PER_BGID (1 << 16) 22 22 23 - struct kmem_cache *io_buf_cachep; 23 + /* Mapped buffer ring, return io_uring_buf from head */ 24 + #define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] 24 25 25 26 struct io_provide_buf { 26 27 struct file *file; ··· 31 30 __u32 nbufs; 32 31 __u16 bid; 33 32 }; 33 + 34 + static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len) 35 + { 36 + while (len) { 37 + struct io_uring_buf *buf; 38 + u32 this_len; 39 + 40 + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); 41 + this_len = min_t(int, len, buf->len); 42 + buf->len -= this_len; 43 + if (buf->len) { 44 + buf->addr += this_len; 45 + return false; 46 + } 47 + bl->head++; 48 + len -= this_len; 49 + } 50 + return true; 51 + } 52 + 53 + bool io_kbuf_commit(struct io_kiocb *req, 54 + struct io_buffer_list *bl, int len, int nr) 55 + { 56 + if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) 57 + return true; 58 + 59 + req->flags &= ~REQ_F_BUFFERS_COMMIT; 60 + 61 + if (unlikely(len < 0)) 62 + return true; 63 + if (bl->flags & IOBL_INC) 64 + return io_kbuf_inc_commit(bl, len); 65 + bl->head += nr; 66 + return true; 67 + } 34 68 35 69 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, 36 70 unsigned int bgid) ··· 88 52 return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); 89 53 } 90 54 55 + void io_kbuf_drop_legacy(struct io_kiocb *req) 56 + { 57 + if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED))) 58 + return; 59 + req->buf_index = req->kbuf->bgid; 60 + req->flags &= ~REQ_F_BUFFER_SELECTED; 61 + kfree(req->kbuf); 62 + req->kbuf = NULL; 63 + } 64 + 91 65 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) 92 66 { 93 67 struct io_ring_ctx *ctx = req->ctx; ··· 114 68 115 69 io_ring_submit_unlock(ctx, issue_flags); 116 70 return true; 117 - } 118 - 119 - void __io_put_kbuf(struct io_kiocb *req, int len, 
unsigned issue_flags) 120 - { 121 - /* 122 - * We can add this buffer back to two lists: 123 - * 124 - * 1) The io_buffers_cache list. This one is protected by the 125 - * ctx->uring_lock. If we already hold this lock, add back to this 126 - * list as we can grab it from issue as well. 127 - * 2) The io_buffers_comp list. This one is protected by the 128 - * ctx->completion_lock. 129 - * 130 - * We migrate buffers from the comp_list to the issue cache list 131 - * when we need one. 132 - */ 133 - if (issue_flags & IO_URING_F_UNLOCKED) { 134 - struct io_ring_ctx *ctx = req->ctx; 135 - 136 - spin_lock(&ctx->completion_lock); 137 - __io_put_kbuf_list(req, len, &ctx->io_buffers_comp); 138 - spin_unlock(&ctx->completion_lock); 139 - } else { 140 - lockdep_assert_held(&req->ctx->uring_lock); 141 - 142 - __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache); 143 - } 144 71 } 145 72 146 73 static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, ··· 233 214 buf = io_ring_head_to_buf(br, head, bl->mask); 234 215 if (arg->max_len) { 235 216 u32 len = READ_ONCE(buf->len); 217 + size_t needed; 236 218 237 219 if (unlikely(!len)) 238 220 return -ENOBUFS; 239 - /* 240 - * Limit incremental buffers to 1 segment. No point trying 241 - * to peek ahead and map more than we need, when the buffers 242 - * themselves should be large when setup with 243 - * IOU_PBUF_RING_INC. 
244 - */ 245 - if (bl->flags & IOBL_INC) { 246 - nr_avail = 1; 247 - } else { 248 - size_t needed; 249 - 250 - needed = (arg->max_len + len - 1) / len; 251 - needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); 252 - if (nr_avail > needed) 253 - nr_avail = needed; 254 - } 221 + needed = (arg->max_len + len - 1) / len; 222 + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); 223 + if (nr_avail > needed) 224 + nr_avail = needed; 255 225 } 256 226 257 227 /* ··· 350 342 return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); 351 343 } 352 344 345 + static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) 346 + { 347 + struct io_buffer_list *bl = req->buf_list; 348 + bool ret = true; 349 + 350 + if (bl) { 351 + ret = io_kbuf_commit(req, bl, len, nr); 352 + req->buf_index = bl->bgid; 353 + } 354 + req->flags &= ~REQ_F_BUFFER_RING; 355 + return ret; 356 + } 357 + 358 + unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) 359 + { 360 + unsigned int ret; 361 + 362 + ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); 363 + 364 + if (unlikely(!(req->flags & REQ_F_BUFFER_RING))) { 365 + io_kbuf_drop_legacy(req); 366 + return ret; 367 + } 368 + 369 + if (!__io_put_kbuf_ring(req, len, nbufs)) 370 + ret |= IORING_CQE_F_BUF_MORE; 371 + return ret; 372 + } 373 + 353 374 static int __io_remove_buffers(struct io_ring_ctx *ctx, 354 375 struct io_buffer_list *bl, unsigned nbufs) 355 376 { ··· 404 367 struct io_buffer *nxt; 405 368 406 369 nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); 407 - list_move(&nxt->list, &ctx->io_buffers_cache); 370 + list_del(&nxt->list); 371 + kfree(nxt); 372 + 408 373 if (++i == nbufs) 409 374 return i; 410 375 cond_resched(); ··· 424 385 void io_destroy_buffers(struct io_ring_ctx *ctx) 425 386 { 426 387 struct io_buffer_list *bl; 427 - struct list_head *item, *tmp; 428 - struct io_buffer *buf; 429 388 430 389 while (1) { 431 390 unsigned long index = 0; ··· 
436 399 if (!bl) 437 400 break; 438 401 io_put_bl(ctx, bl); 439 - } 440 - 441 - /* 442 - * Move deferred locked entries to cache before pruning 443 - */ 444 - spin_lock(&ctx->completion_lock); 445 - if (!list_empty(&ctx->io_buffers_comp)) 446 - list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache); 447 - spin_unlock(&ctx->completion_lock); 448 - 449 - list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { 450 - buf = list_entry(item, struct io_buffer, list); 451 - kmem_cache_free(io_buf_cachep, buf); 452 402 } 453 403 } 454 404 ··· 525 501 return 0; 526 502 } 527 503 528 - #define IO_BUFFER_ALLOC_BATCH 64 529 - 530 - static int io_refill_buffer_cache(struct io_ring_ctx *ctx) 531 - { 532 - struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH]; 533 - int allocated; 534 - 535 - /* 536 - * Completions that don't happen inline (eg not under uring_lock) will 537 - * add to ->io_buffers_comp. If we don't have any free buffers, check 538 - * the completion list and splice those entries first. 539 - */ 540 - if (!list_empty_careful(&ctx->io_buffers_comp)) { 541 - spin_lock(&ctx->completion_lock); 542 - if (!list_empty(&ctx->io_buffers_comp)) { 543 - list_splice_init(&ctx->io_buffers_comp, 544 - &ctx->io_buffers_cache); 545 - spin_unlock(&ctx->completion_lock); 546 - return 0; 547 - } 548 - spin_unlock(&ctx->completion_lock); 549 - } 550 - 551 - /* 552 - * No free buffers and no completion entries either. Allocate a new 553 - * batch of buffer entries and add those to our freelist. 554 - */ 555 - 556 - allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT, 557 - ARRAY_SIZE(bufs), (void **) bufs); 558 - if (unlikely(!allocated)) { 559 - /* 560 - * Bulk alloc is all-or-nothing. If we fail to get a batch, 561 - * retry single alloc to be on the safe side. 
562 - */ 563 - bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); 564 - if (!bufs[0]) 565 - return -ENOMEM; 566 - allocated = 1; 567 - } 568 - 569 - while (allocated) 570 - list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache); 571 - 572 - return 0; 573 - } 574 - 575 504 static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, 576 505 struct io_buffer_list *bl) 577 506 { ··· 533 556 int i, bid = pbuf->bid; 534 557 535 558 for (i = 0; i < pbuf->nbufs; i++) { 536 - if (list_empty(&ctx->io_buffers_cache) && 537 - io_refill_buffer_cache(ctx)) 559 + buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); 560 + if (!buf) 538 561 break; 539 - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, 540 - list); 541 - list_move_tail(&buf->list, &bl->buf_list); 562 + 563 + list_add_tail(&buf->list, &bl->buf_list); 542 564 buf->addr = addr; 543 565 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); 544 566 buf->bid = bid;
+11 -89
io_uring/kbuf.h
··· 74 74 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); 75 75 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); 76 76 77 - void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); 78 - 79 77 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); 78 + void io_kbuf_drop_legacy(struct io_kiocb *req); 79 + 80 + unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs); 81 + bool io_kbuf_commit(struct io_kiocb *req, 82 + struct io_buffer_list *bl, int len, int nr); 80 83 81 84 struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, 82 85 unsigned int bgid); ··· 119 116 return false; 120 117 } 121 118 122 - /* Mapped buffer ring, return io_uring_buf from head */ 123 - #define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] 124 - 125 - static inline bool io_kbuf_commit(struct io_kiocb *req, 126 - struct io_buffer_list *bl, int len, int nr) 127 - { 128 - if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) 129 - return true; 130 - 131 - req->flags &= ~REQ_F_BUFFERS_COMMIT; 132 - 133 - if (unlikely(len < 0)) 134 - return true; 135 - 136 - if (bl->flags & IOBL_INC) { 137 - struct io_uring_buf *buf; 138 - 139 - buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); 140 - if (WARN_ON_ONCE(len > buf->len)) 141 - len = buf->len; 142 - buf->len -= len; 143 - if (buf->len) { 144 - buf->addr += len; 145 - return false; 146 - } 147 - } 148 - 149 - bl->head += nr; 150 - return true; 151 - } 152 - 153 - static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) 154 - { 155 - struct io_buffer_list *bl = req->buf_list; 156 - bool ret = true; 157 - 158 - if (bl) { 159 - ret = io_kbuf_commit(req, bl, len, nr); 160 - req->buf_index = bl->bgid; 161 - } 162 - req->flags &= ~REQ_F_BUFFER_RING; 163 - return ret; 164 - } 165 - 166 - static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, 167 - struct list_head *list) 168 - { 169 - if 
(req->flags & REQ_F_BUFFER_RING) { 170 - __io_put_kbuf_ring(req, len, 1); 171 - } else { 172 - req->buf_index = req->kbuf->bgid; 173 - list_add(&req->kbuf->list, list); 174 - req->flags &= ~REQ_F_BUFFER_SELECTED; 175 - } 176 - } 177 - 178 - static inline void io_kbuf_drop(struct io_kiocb *req) 179 - { 180 - lockdep_assert_held(&req->ctx->completion_lock); 181 - 182 - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) 183 - return; 184 - 185 - /* len == 0 is fine here, non-ring will always drop all of it */ 186 - __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp); 187 - } 188 - 189 - static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, 190 - int nbufs, unsigned issue_flags) 191 - { 192 - unsigned int ret; 193 - 194 - if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) 195 - return 0; 196 - 197 - ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); 198 - if (req->flags & REQ_F_BUFFER_RING) { 199 - if (!__io_put_kbuf_ring(req, len, nbufs)) 200 - ret |= IORING_CQE_F_BUF_MORE; 201 - } else { 202 - __io_put_kbuf(req, len, issue_flags); 203 - } 204 - return ret; 205 - } 206 - 207 119 static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, 208 120 unsigned issue_flags) 209 121 { 210 - return __io_put_kbufs(req, len, 1, issue_flags); 122 + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) 123 + return 0; 124 + return __io_put_kbufs(req, len, 1); 211 125 } 212 126 213 127 static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, 214 128 int nbufs, unsigned issue_flags) 215 129 { 216 - return __io_put_kbufs(req, len, nbufs, issue_flags); 130 + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) 131 + return 0; 132 + return __io_put_kbufs(req, len, nbufs); 217 133 } 218 134 #endif
+1 -1
io_uring/msg_ring.c
··· 71 71 return target_ctx->task_complete; 72 72 } 73 73 74 - static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) 74 + static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) 75 75 { 76 76 struct io_ring_ctx *ctx = req->ctx; 77 77
+114 -143
io_uring/net.c
··· 75 75 u16 flags; 76 76 /* initialised and used only by !msg send variants */ 77 77 u16 buf_group; 78 - u16 buf_index; 78 + bool retry; 79 79 void __user *msg_control; 80 80 /* used only for send zerocopy */ 81 81 struct io_kiocb *notif; ··· 187 187 188 188 req->flags &= ~REQ_F_BL_EMPTY; 189 189 sr->done_io = 0; 190 + sr->retry = false; 190 191 sr->len = 0; /* get from the provided buffer */ 191 192 req->buf_index = sr->buf_group; 192 193 } 193 194 194 - #ifdef CONFIG_COMPAT 195 - static int io_compat_msg_copy_hdr(struct io_kiocb *req, 196 - struct io_async_msghdr *iomsg, 197 - struct compat_msghdr *msg, int ddir) 195 + static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg, 196 + const struct iovec __user *uiov, unsigned uvec_seg, 197 + int ddir) 198 198 { 199 - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 200 - struct compat_iovec __user *uiov; 201 199 struct iovec *iov; 202 200 int ret, nr_segs; 203 201 ··· 203 205 nr_segs = iomsg->free_iov_nr; 204 206 iov = iomsg->free_iov; 205 207 } else { 206 - iov = &iomsg->fast_iov; 207 208 nr_segs = 1; 209 + iov = &iomsg->fast_iov; 208 210 } 211 + 212 + ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov, 213 + &iomsg->msg.msg_iter, io_is_compat(req->ctx)); 214 + if (unlikely(ret < 0)) 215 + return ret; 216 + io_net_vec_assign(req, iomsg, iov); 217 + return 0; 218 + } 219 + 220 + static int io_compat_msg_copy_hdr(struct io_kiocb *req, 221 + struct io_async_msghdr *iomsg, 222 + struct compat_msghdr *msg, int ddir, 223 + struct sockaddr __user **save_addr) 224 + { 225 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 226 + struct compat_iovec __user *uiov; 227 + int ret; 209 228 210 229 if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) 211 230 return -EFAULT; 212 231 232 + ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr); 233 + if (ret) 234 + return ret; 235 + 213 236 uiov = compat_ptr(msg->msg_iov); 214 237 if (req->flags & REQ_F_BUFFER_SELECT) { 
215 - compat_ssize_t clen; 216 - 217 238 if (msg->msg_iovlen == 0) { 218 - sr->len = iov->iov_len = 0; 219 - iov->iov_base = NULL; 239 + sr->len = 0; 220 240 } else if (msg->msg_iovlen > 1) { 221 241 return -EINVAL; 222 242 } else { 223 - if (!access_ok(uiov, sizeof(*uiov))) 243 + struct compat_iovec tmp_iov; 244 + 245 + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) 224 246 return -EFAULT; 225 - if (__get_user(clen, &uiov->iov_len)) 226 - return -EFAULT; 227 - if (clen < 0) 228 - return -EINVAL; 229 - sr->len = clen; 247 + sr->len = tmp_iov.iov_len; 230 248 } 231 249 232 250 return 0; 233 251 } 234 252 235 - ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, 236 - nr_segs, &iov, &iomsg->msg.msg_iter, true); 237 - if (unlikely(ret < 0)) 238 - return ret; 239 - 240 - io_net_vec_assign(req, iomsg, iov); 241 - return 0; 253 + return io_net_import_vec(req, iomsg, (struct iovec __user *)uiov, 254 + msg->msg_iovlen, ddir); 242 255 } 243 - #endif 244 256 245 - static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, 246 - struct user_msghdr *msg, int ddir) 257 + static int io_copy_msghdr_from_user(struct user_msghdr *msg, 258 + struct user_msghdr __user *umsg) 247 259 { 248 - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 249 - struct user_msghdr __user *umsg = sr->umsg; 250 - struct iovec *iov; 251 - int ret, nr_segs; 252 - 253 - if (iomsg->free_iov) { 254 - nr_segs = iomsg->free_iov_nr; 255 - iov = iomsg->free_iov; 256 - } else { 257 - iov = &iomsg->fast_iov; 258 - nr_segs = 1; 259 - } 260 - 261 260 if (!user_access_begin(umsg, sizeof(*umsg))) 262 261 return -EFAULT; 263 - 264 - ret = -EFAULT; 265 262 unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end); 266 263 unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end); 267 264 unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end); 268 265 unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end); 269 266 unsafe_get_user(msg->msg_control, 
&umsg->msg_control, ua_end); 270 267 unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end); 268 + user_access_end(); 269 + return 0; 270 + ua_end: 271 + user_access_end(); 272 + return -EFAULT; 273 + } 274 + 275 + static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, 276 + struct user_msghdr *msg, int ddir, 277 + struct sockaddr __user **save_addr) 278 + { 279 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 280 + struct user_msghdr __user *umsg = sr->umsg; 281 + int ret; 282 + 283 + ret = io_copy_msghdr_from_user(msg, umsg); 284 + if (unlikely(ret)) 285 + return ret; 286 + 271 287 msg->msg_flags = 0; 288 + 289 + ret = __copy_msghdr(&iomsg->msg, msg, save_addr); 290 + if (ret) 291 + return ret; 272 292 273 293 if (req->flags & REQ_F_BUFFER_SELECT) { 274 294 if (msg->msg_iovlen == 0) { 275 - sr->len = iov->iov_len = 0; 276 - iov->iov_base = NULL; 295 + sr->len = 0; 277 296 } else if (msg->msg_iovlen > 1) { 278 - ret = -EINVAL; 279 - goto ua_end; 297 + return -EINVAL; 280 298 } else { 281 299 struct iovec __user *uiov = msg->msg_iov; 300 + struct iovec tmp_iov; 282 301 283 - /* we only need the length for provided buffers */ 284 - if (!access_ok(&uiov->iov_len, sizeof(uiov->iov_len))) 285 - goto ua_end; 286 - unsafe_get_user(iov->iov_len, &uiov->iov_len, ua_end); 287 - sr->len = iov->iov_len; 302 + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) 303 + return -EFAULT; 304 + sr->len = tmp_iov.iov_len; 288 305 } 289 - ret = 0; 290 - ua_end: 291 - user_access_end(); 292 - return ret; 306 + return 0; 293 307 } 294 308 295 - user_access_end(); 296 - ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, 297 - &iov, &iomsg->msg.msg_iter, false); 298 - if (unlikely(ret < 0)) 299 - return ret; 300 - 301 - io_net_vec_assign(req, iomsg, iov); 302 - return 0; 309 + return io_net_import_vec(req, iomsg, msg->msg_iov, msg->msg_iovlen, ddir); 303 310 } 304 311 305 312 static int io_sendmsg_copy_hdr(struct 
io_kiocb *req, ··· 317 314 iomsg->msg.msg_name = &iomsg->addr; 318 315 iomsg->msg.msg_iter.nr_segs = 0; 319 316 320 - #ifdef CONFIG_COMPAT 321 - if (unlikely(req->ctx->compat)) { 317 + if (io_is_compat(req->ctx)) { 322 318 struct compat_msghdr cmsg; 323 319 324 - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE); 325 - if (unlikely(ret)) 326 - return ret; 327 - 328 - ret = __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); 320 + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE, 321 + NULL); 329 322 sr->msg_control = iomsg->msg.msg_control_user; 330 323 return ret; 331 324 } 332 - #endif 333 325 334 - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE); 335 - if (unlikely(ret)) 336 - return ret; 337 - 338 - ret = __copy_msghdr(&iomsg->msg, &msg, NULL); 339 - 326 + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL); 340 327 /* save msg_control as sys_sendmsg() overwrites it */ 341 328 sr->msg_control = iomsg->msg.msg_control_user; 342 329 return ret; ··· 380 387 { 381 388 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 382 389 struct io_async_msghdr *kmsg = req->async_data; 383 - int ret; 384 390 385 391 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 386 392 387 - ret = io_sendmsg_copy_hdr(req, kmsg); 388 - if (!ret) 389 - req->flags |= REQ_F_NEED_CLEANUP; 390 - return ret; 393 + return io_sendmsg_copy_hdr(req, kmsg); 391 394 } 392 395 393 396 #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) ··· 393 404 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 394 405 395 406 sr->done_io = 0; 407 + sr->retry = false; 396 408 397 409 if (req->opcode != IORING_OP_SEND) { 398 410 if (sqe->addr2 || sqe->file_index) ··· 417 427 req->buf_list = NULL; 418 428 } 419 429 420 - #ifdef CONFIG_COMPAT 421 - if (req->ctx->compat) 430 + if (io_is_compat(req->ctx)) 422 431 sr->msg_flags |= MSG_CMSG_COMPAT; 423 - #endif 432 + 424 433 if (unlikely(!io_msg_alloc_async(req))) 425 434 return -ENOMEM; 426 435 if 
(req->opcode != IORING_OP_SENDMSG) ··· 703 714 iomsg->msg.msg_name = &iomsg->addr; 704 715 iomsg->msg.msg_iter.nr_segs = 0; 705 716 706 - #ifdef CONFIG_COMPAT 707 - if (unlikely(req->ctx->compat)) { 717 + if (io_is_compat(req->ctx)) { 708 718 struct compat_msghdr cmsg; 709 719 710 - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST); 711 - if (unlikely(ret)) 712 - return ret; 713 - 714 - ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr); 715 - if (unlikely(ret)) 716 - return ret; 717 - 718 - return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen, 719 - cmsg.msg_controllen); 720 + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST, 721 + &iomsg->uaddr); 722 + memset(&msg, 0, sizeof(msg)); 723 + msg.msg_namelen = cmsg.msg_namelen; 724 + msg.msg_controllen = cmsg.msg_controllen; 725 + } else { 726 + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); 720 727 } 721 - #endif 722 728 723 - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST); 724 729 if (unlikely(ret)) 725 730 return ret; 726 - 727 - ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); 728 - if (unlikely(ret)) 729 - return ret; 730 - 731 731 return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, 732 732 msg.msg_controllen); 733 733 } ··· 750 772 return 0; 751 773 } 752 774 753 - ret = io_recvmsg_copy_hdr(req, kmsg); 754 - if (!ret) 755 - req->flags |= REQ_F_NEED_CLEANUP; 756 - return ret; 775 + return io_recvmsg_copy_hdr(req, kmsg); 757 776 } 758 777 759 778 #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ ··· 761 786 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 762 787 763 788 sr->done_io = 0; 789 + sr->retry = false; 764 790 765 791 if (unlikely(sqe->file_index || sqe->addr2)) 766 792 return -EINVAL; ··· 802 826 return -EINVAL; 803 827 } 804 828 805 - #ifdef CONFIG_COMPAT 806 - if (req->ctx->compat) 829 + if (io_is_compat(req->ctx)) 807 830 sr->msg_flags |= MSG_CMSG_COMPAT; 808 - #endif 831 + 809 832 
sr->nr_multishot_loops = 0; 810 833 return io_recvmsg_prep_setup(req); 811 834 } 835 + 836 + /* bits to clear in old and inherit in new cflags on bundle retry */ 837 + #define CQE_F_MASK (IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE) 812 838 813 839 /* 814 840 * Finishes io_recv and io_recvmsg. ··· 831 853 if (sr->flags & IORING_RECVSEND_BUNDLE) { 832 854 cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), 833 855 issue_flags); 856 + if (sr->retry) 857 + cflags = req->cqe.flags | (cflags & CQE_F_MASK); 834 858 /* bundle with no more immediate buffers, we're done */ 835 859 if (req->flags & REQ_F_BL_EMPTY) 836 860 goto finish; 861 + /* if more is available, retry and append to this one */ 862 + if (!sr->retry && kmsg->msg.msg_inq > 0 && *ret > 0) { 863 + req->cqe.flags = cflags & ~CQE_F_MASK; 864 + sr->len = kmsg->msg.msg_inq; 865 + sr->done_io += *ret; 866 + sr->retry = true; 867 + return false; 868 + } 837 869 } else { 838 870 cflags |= io_put_kbuf(req, *ret, issue_flags); 839 871 } ··· 1222 1234 struct io_kiocb *notif; 1223 1235 1224 1236 zc->done_io = 0; 1237 + zc->retry = false; 1225 1238 req->flags |= REQ_F_POLL_NO_LAZY; 1226 1239 1227 1240 if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) ··· 1261 1272 1262 1273 zc->len = READ_ONCE(sqe->len); 1263 1274 zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; 1264 - zc->buf_index = READ_ONCE(sqe->buf_index); 1275 + req->buf_index = READ_ONCE(sqe->buf_index); 1265 1276 if (zc->msg_flags & MSG_DONTWAIT) 1266 1277 req->flags |= REQ_F_NOWAIT; 1267 1278 1268 - #ifdef CONFIG_COMPAT 1269 - if (req->ctx->compat) 1279 + if (io_is_compat(req->ctx)) 1270 1280 zc->msg_flags |= MSG_CMSG_COMPAT; 1271 - #endif 1281 + 1272 1282 if (unlikely(!io_msg_alloc_async(req))) 1273 1283 return -ENOMEM; 1274 1284 if (req->opcode != IORING_OP_SENDMSG_ZC) ··· 1332 1344 int ret; 1333 1345 1334 1346 if (sr->flags & IORING_RECVSEND_FIXED_BUF) { 1335 - struct io_ring_ctx *ctx = req->ctx; 1336 - 
struct io_rsrc_node *node; 1337 - 1338 - ret = -EFAULT; 1339 - io_ring_submit_lock(ctx, issue_flags); 1340 - node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index); 1341 - if (node) { 1342 - io_req_assign_buf_node(sr->notif, node); 1343 - ret = 0; 1344 - } 1345 - io_ring_submit_unlock(ctx, issue_flags); 1346 - 1347 - if (unlikely(ret)) 1348 - return ret; 1349 - 1350 - ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, 1351 - node->buf, (u64)(uintptr_t)sr->buf, 1352 - sr->len); 1347 + sr->notif->buf_index = req->buf_index; 1348 + ret = io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, 1349 + (u64)(uintptr_t)sr->buf, sr->len, 1350 + ITER_SOURCE, issue_flags); 1353 1351 if (unlikely(ret)) 1354 1352 return ret; 1355 1353 kmsg->msg.sg_from_iter = io_sg_from_iter; ··· 1574 1600 } 1575 1601 if (ret == -ERESTARTSYS) 1576 1602 ret = -EINTR; 1577 - req_set_fail(req); 1578 1603 } else if (!fixed) { 1579 1604 fd_install(fd, file); 1580 1605 ret = fd; ··· 1586 1613 if (!arg.is_empty) 1587 1614 cflags |= IORING_CQE_F_SOCK_NONEMPTY; 1588 1615 1589 - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { 1590 - io_req_set_res(req, ret, cflags); 1591 - return IOU_OK; 1592 - } 1593 - 1594 - if (ret < 0) 1595 - return ret; 1596 - if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { 1616 + if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) && 1617 + io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { 1597 1618 if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) 1598 1619 goto retry; 1599 1620 if (issue_flags & IO_URING_F_MULTISHOT) ··· 1596 1629 } 1597 1630 1598 1631 io_req_set_res(req, ret, cflags); 1632 + if (ret < 0) 1633 + req_set_fail(req); 1634 + if (!(issue_flags & IO_URING_F_MULTISHOT)) 1635 + return IOU_OK; 1599 1636 return IOU_STOP_MULTISHOT; 1600 1637 } 1601 1638
+3 -15
io_uring/nop.c
··· 16 16 struct file *file; 17 17 int result; 18 18 int fd; 19 - int buffer; 20 19 unsigned int flags; 21 20 }; 22 21 ··· 39 40 else 40 41 nop->fd = -1; 41 42 if (nop->flags & IORING_NOP_FIXED_BUFFER) 42 - nop->buffer = READ_ONCE(sqe->buf_index); 43 - else 44 - nop->buffer = -1; 43 + req->buf_index = READ_ONCE(sqe->buf_index); 45 44 return 0; 46 45 } 47 46 ··· 61 64 } 62 65 } 63 66 if (nop->flags & IORING_NOP_FIXED_BUFFER) { 64 - struct io_ring_ctx *ctx = req->ctx; 65 - struct io_rsrc_node *node; 66 - 67 - ret = -EFAULT; 68 - io_ring_submit_lock(ctx, issue_flags); 69 - node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer); 70 - if (node) { 71 - io_req_assign_buf_node(req, node); 72 - ret = 0; 73 - } 74 - io_ring_submit_unlock(ctx, issue_flags); 67 + if (!io_find_buf_node(req, issue_flags)) 68 + ret = -EFAULT; 75 69 } 76 70 done: 77 71 if (ret < 0)
+2 -2
io_uring/notif.c
··· 11 11 12 12 static const struct ubuf_info_ops io_ubuf_ops; 13 13 14 - static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) 14 + static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) 15 15 { 16 16 struct io_notif_data *nd = io_notif_to_data(notif); 17 17 ··· 29 29 } 30 30 31 31 nd = nd->next; 32 - io_req_task_complete(notif, ts); 32 + io_req_task_complete(notif, tw); 33 33 } while (nd); 34 34 } 35 35
+2 -2
io_uring/opdef.c
··· 104 104 .iopoll_queue = 1, 105 105 .async_size = sizeof(struct io_async_rw), 106 106 .prep = io_prep_read_fixed, 107 - .issue = io_read, 107 + .issue = io_read_fixed, 108 108 }, 109 109 [IORING_OP_WRITE_FIXED] = { 110 110 .needs_file = 1, ··· 118 118 .iopoll_queue = 1, 119 119 .async_size = sizeof(struct io_async_rw), 120 120 .prep = io_prep_write_fixed, 121 - .issue = io_write, 121 + .issue = io_write_fixed, 122 122 }, 123 123 [IORING_OP_POLL_ADD] = { 124 124 .needs_file = 1,
+6 -6
io_uring/opdef.h
··· 7 7 unsigned needs_file : 1; 8 8 /* should block plug */ 9 9 unsigned plug : 1; 10 + /* supports ioprio */ 11 + unsigned ioprio : 1; 12 + /* supports iopoll */ 13 + unsigned iopoll : 1; 14 + /* op supports buffer selection */ 15 + unsigned buffer_select : 1; 10 16 /* hash wq insertion if file is a regular file */ 11 17 unsigned hash_reg_file : 1; 12 18 /* unbound wq insertion if file is a non-regular file */ ··· 21 15 unsigned pollin : 1; 22 16 unsigned pollout : 1; 23 17 unsigned poll_exclusive : 1; 24 - /* op supports buffer selection */ 25 - unsigned buffer_select : 1; 26 18 /* skip auditing */ 27 19 unsigned audit_skip : 1; 28 - /* supports ioprio */ 29 - unsigned ioprio : 1; 30 - /* supports iopoll */ 31 - unsigned iopoll : 1; 32 20 /* have to be put into the iopoll list */ 33 21 unsigned iopoll_queue : 1; 34 22 /* vectored opcode, set if 1) vectored, and 2) handler needs to know */
+9 -9
io_uring/poll.c
··· 220 220 * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot 221 221 * poll and that the result is stored in req->cqe. 222 222 */ 223 - static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) 223 + static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) 224 224 { 225 225 int v; 226 226 ··· 288 288 return IOU_POLL_REMOVE_POLL_USE_RES; 289 289 } 290 290 } else { 291 - int ret = io_poll_issue(req, ts); 291 + int ret = io_poll_issue(req, tw); 292 292 if (ret == IOU_STOP_MULTISHOT) 293 293 return IOU_POLL_REMOVE_POLL_USE_RES; 294 294 else if (ret == IOU_REQUEUE) ··· 311 311 return IOU_POLL_NO_ACTION; 312 312 } 313 313 314 - void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) 314 + void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) 315 315 { 316 316 int ret; 317 317 318 - ret = io_poll_check_events(req, ts); 318 + ret = io_poll_check_events(req, tw); 319 319 if (ret == IOU_POLL_NO_ACTION) { 320 320 io_kbuf_recycle(req, 0); 321 321 return; ··· 335 335 poll = io_kiocb_to_cmd(req, struct io_poll); 336 336 req->cqe.res = mangle_poll(req->cqe.res & poll->events); 337 337 } else if (ret == IOU_POLL_REISSUE) { 338 - io_req_task_submit(req, ts); 338 + io_req_task_submit(req, tw); 339 339 return; 340 340 } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { 341 341 req->cqe.res = ret; ··· 343 343 } 344 344 345 345 io_req_set_res(req, req->cqe.res, 0); 346 - io_req_task_complete(req, ts); 346 + io_req_task_complete(req, tw); 347 347 } else { 348 - io_tw_lock(req->ctx, ts); 348 + io_tw_lock(req->ctx, tw); 349 349 350 350 if (ret == IOU_POLL_REMOVE_POLL_USE_RES) 351 - io_req_task_complete(req, ts); 351 + io_req_task_complete(req, tw); 352 352 else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) 353 - io_req_task_submit(req, ts); 353 + io_req_task_submit(req, tw); 354 354 else 355 355 io_req_defer_failed(req, ret); 356 356 }
+3 -1
io_uring/poll.h
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 + #include <linux/io_uring_types.h> 4 + 3 5 #define IO_POLL_ALLOC_CACHE_MAX 32 4 6 5 7 enum { ··· 45 43 bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 46 44 bool cancel_all); 47 45 48 - void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); 46 + void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw);
+216 -34
io_uring/rsrc.c
··· 9 9 #include <linux/hugetlb.h> 10 10 #include <linux/compat.h> 11 11 #include <linux/io_uring.h> 12 + #include <linux/io_uring/cmd.h> 12 13 13 14 #include <uapi/linux/io_uring.h> 14 15 ··· 32 31 /* only define max */ 33 32 #define IORING_MAX_FIXED_FILES (1U << 20) 34 33 #define IORING_MAX_REG_BUFFERS (1U << 14) 34 + 35 + #define IO_CACHED_BVECS_SEGS 32 35 36 36 37 int __io_account_mem(struct user_struct *user, unsigned long nr_pages) 37 38 { ··· 104 101 return 0; 105 102 } 106 103 107 - static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) 104 + static void io_release_ubuf(void *priv) 108 105 { 106 + struct io_mapped_ubuf *imu = priv; 109 107 unsigned int i; 110 108 111 - if (node->buf) { 112 - struct io_mapped_ubuf *imu = node->buf; 113 - 114 - if (!refcount_dec_and_test(&imu->refs)) 115 - return; 116 - for (i = 0; i < imu->nr_bvecs; i++) 117 - unpin_user_page(imu->bvec[i].bv_page); 118 - if (imu->acct_pages) 119 - io_unaccount_mem(ctx, imu->acct_pages); 120 - kvfree(imu); 121 - } 109 + for (i = 0; i < imu->nr_bvecs; i++) 110 + unpin_user_page(imu->bvec[i].bv_page); 122 111 } 123 112 124 - struct io_rsrc_node *io_rsrc_node_alloc(int type) 113 + static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, 114 + int nr_bvecs) 115 + { 116 + if (nr_bvecs <= IO_CACHED_BVECS_SEGS) 117 + return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); 118 + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs), 119 + GFP_KERNEL); 120 + } 121 + 122 + static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) 123 + { 124 + if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) 125 + io_cache_free(&ctx->imu_cache, imu); 126 + else 127 + kvfree(imu); 128 + } 129 + 130 + static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) 131 + { 132 + if (!refcount_dec_and_test(&imu->refs)) 133 + return; 134 + 135 + if (imu->acct_pages) 136 + io_unaccount_mem(ctx, imu->acct_pages); 137 + imu->release(imu->priv); 138 + 
io_free_imu(ctx, imu); 139 + } 140 + 141 + struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) 125 142 { 126 143 struct io_rsrc_node *node; 127 144 128 - node = kzalloc(sizeof(*node), GFP_KERNEL); 145 + node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); 129 146 if (node) { 130 147 node->type = type; 131 148 node->refs = 1; 149 + node->tag = 0; 150 + node->file_ptr = 0; 132 151 } 133 152 return node; 134 153 } 135 154 136 - __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) 155 + bool io_rsrc_cache_init(struct io_ring_ctx *ctx) 156 + { 157 + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, 158 + IO_CACHED_BVECS_SEGS); 159 + const int node_size = sizeof(struct io_rsrc_node); 160 + bool ret; 161 + 162 + ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, 163 + node_size, 0); 164 + ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, 165 + imu_cache_size, 0); 166 + return ret; 167 + } 168 + 169 + void io_rsrc_cache_free(struct io_ring_ctx *ctx) 170 + { 171 + io_alloc_cache_free(&ctx->node_cache, kfree); 172 + io_alloc_cache_free(&ctx->imu_cache, kfree); 173 + } 174 + 175 + __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, 176 + struct io_rsrc_data *data) 137 177 { 138 178 if (!data->nr) 139 179 return; ··· 249 203 err = -EBADF; 250 204 break; 251 205 } 252 - node = io_rsrc_node_alloc(IORING_RSRC_FILE); 206 + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 253 207 if (!node) { 254 208 err = -ENOMEM; 255 209 fput(file); ··· 495 449 496 450 switch (node->type) { 497 451 case IORING_RSRC_FILE: 498 - if (io_slot_file(node)) 499 - fput(io_slot_file(node)); 452 + fput(io_slot_file(node)); 500 453 break; 501 454 case IORING_RSRC_BUFFER: 502 - if (node->buf) 503 - io_buffer_unmap(ctx, node); 455 + io_buffer_unmap(ctx, node->buf); 504 456 break; 505 457 default: 506 458 WARN_ON_ONCE(1); 507 459 break; 508 460 } 509 461 510 - kfree(node); 462 + 
io_cache_free(&ctx->node_cache, node); 511 463 } 512 464 513 465 int io_sqe_files_unregister(struct io_ring_ctx *ctx) ··· 567 523 goto fail; 568 524 } 569 525 ret = -ENOMEM; 570 - node = io_rsrc_node_alloc(IORING_RSRC_FILE); 526 + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 571 527 if (!node) { 572 528 fput(file); 573 529 goto fail; ··· 772 728 if (!iov->iov_base) 773 729 return NULL; 774 730 775 - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); 731 + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 776 732 if (!node) 777 733 return ERR_PTR(-ENOMEM); 778 - node->buf = NULL; 779 734 780 735 ret = -ENOMEM; 781 736 pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, ··· 791 748 coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); 792 749 } 793 750 794 - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 751 + imu = io_alloc_imu(ctx, nr_pages); 795 752 if (!imu) 796 753 goto done; 797 754 755 + imu->nr_bvecs = nr_pages; 798 756 ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); 799 757 if (ret) { 800 758 unpin_user_pages(pages, nr_pages); ··· 806 762 /* store original address for later verification */ 807 763 imu->ubuf = (unsigned long) iov->iov_base; 808 764 imu->len = iov->iov_len; 809 - imu->nr_bvecs = nr_pages; 810 765 imu->folio_shift = PAGE_SHIFT; 766 + imu->release = io_release_ubuf; 767 + imu->priv = imu; 768 + imu->is_kbuf = false; 769 + imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; 811 770 if (coalesced) 812 771 imu->folio_shift = data.folio_shift; 813 772 refcount_set(&imu->refs, 1); ··· 828 781 } 829 782 done: 830 783 if (ret) { 831 - kvfree(imu); 832 - if (node) 833 - io_put_rsrc_node(ctx, node); 784 + if (imu) 785 + io_free_imu(ctx, imu); 786 + io_cache_free(&ctx->node_cache, node); 834 787 node = ERR_PTR(ret); 835 788 } 836 789 kvfree(pages); ··· 907 860 return ret; 908 861 } 909 862 910 - int io_import_fixed(int ddir, struct iov_iter *iter, 863 + int io_buffer_register_bvec(struct io_uring_cmd *cmd, 
struct request *rq, 864 + void (*release)(void *), unsigned int index, 865 + unsigned int issue_flags) 866 + { 867 + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; 868 + struct io_rsrc_data *data = &ctx->buf_table; 869 + struct req_iterator rq_iter; 870 + struct io_mapped_ubuf *imu; 871 + struct io_rsrc_node *node; 872 + struct bio_vec bv, *bvec; 873 + u16 nr_bvecs; 874 + int ret = 0; 875 + 876 + io_ring_submit_lock(ctx, issue_flags); 877 + if (index >= data->nr) { 878 + ret = -EINVAL; 879 + goto unlock; 880 + } 881 + index = array_index_nospec(index, data->nr); 882 + 883 + if (data->nodes[index]) { 884 + ret = -EBUSY; 885 + goto unlock; 886 + } 887 + 888 + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 889 + if (!node) { 890 + ret = -ENOMEM; 891 + goto unlock; 892 + } 893 + 894 + nr_bvecs = blk_rq_nr_phys_segments(rq); 895 + imu = io_alloc_imu(ctx, nr_bvecs); 896 + if (!imu) { 897 + kfree(node); 898 + ret = -ENOMEM; 899 + goto unlock; 900 + } 901 + 902 + imu->ubuf = 0; 903 + imu->len = blk_rq_bytes(rq); 904 + imu->acct_pages = 0; 905 + imu->folio_shift = PAGE_SHIFT; 906 + imu->nr_bvecs = nr_bvecs; 907 + refcount_set(&imu->refs, 1); 908 + imu->release = release; 909 + imu->priv = rq; 910 + imu->is_kbuf = true; 911 + imu->dir = 1 << rq_data_dir(rq); 912 + 913 + bvec = imu->bvec; 914 + rq_for_each_bvec(bv, rq, rq_iter) 915 + *bvec++ = bv; 916 + 917 + node->buf = imu; 918 + data->nodes[index] = node; 919 + unlock: 920 + io_ring_submit_unlock(ctx, issue_flags); 921 + return ret; 922 + } 923 + EXPORT_SYMBOL_GPL(io_buffer_register_bvec); 924 + 925 + int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, 926 + unsigned int issue_flags) 927 + { 928 + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; 929 + struct io_rsrc_data *data = &ctx->buf_table; 930 + struct io_rsrc_node *node; 931 + int ret = 0; 932 + 933 + io_ring_submit_lock(ctx, issue_flags); 934 + if (index >= data->nr) { 935 + ret = -EINVAL; 936 + goto unlock; 937 + } 938 + 
index = array_index_nospec(index, data->nr); 939 + 940 + node = data->nodes[index]; 941 + if (!node) { 942 + ret = -EINVAL; 943 + goto unlock; 944 + } 945 + if (!node->buf->is_kbuf) { 946 + ret = -EBUSY; 947 + goto unlock; 948 + } 949 + 950 + io_put_rsrc_node(ctx, node); 951 + data->nodes[index] = NULL; 952 + unlock: 953 + io_ring_submit_unlock(ctx, issue_flags); 954 + return ret; 955 + } 956 + EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); 957 + 958 + static int io_import_fixed(int ddir, struct iov_iter *iter, 911 959 struct io_mapped_ubuf *imu, 912 960 u64 buf_addr, size_t len) 913 961 { ··· 1016 874 /* not inside the mapped region */ 1017 875 if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) 1018 876 return -EFAULT; 877 + if (!(imu->dir & (1 << ddir))) 878 + return -EFAULT; 1019 879 1020 880 /* 1021 881 * Might not be a start of buffer, set size appropriately 1022 882 * and advance us to the beginning. 1023 883 */ 1024 884 offset = buf_addr - imu->ubuf; 1025 - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len); 885 + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); 1026 886 1027 887 if (offset) { 1028 888 /* 1029 889 * Don't use iov_iter_advance() here, as it's really slow for 1030 890 * using the latter parts of a big fixed buffer - it iterates 1031 - * over each segment manually. We can cheat a bit here, because 1032 - * we know that: 891 + * over each segment manually. We can cheat a bit here for user 892 + * registered nodes, because we know that: 1033 893 * 1034 894 * 1) it's a BVEC iter, we set it up 1035 895 * 2) all bvecs are the same in size, except potentially the ··· 1045 901 */ 1046 902 const struct bio_vec *bvec = imu->bvec; 1047 903 904 + /* 905 + * Kernel buffer bvecs, on the other hand, don't necessarily 906 + * have the size property of user registered ones, so we have 907 + * to use the slow iter advance. 
908 + */ 1048 909 if (offset < bvec->bv_len) { 910 + iter->count -= offset; 1049 911 iter->iov_offset = offset; 912 + } else if (imu->is_kbuf) { 913 + iov_iter_advance(iter, offset); 1050 914 } else { 1051 915 unsigned long seg_skip; 1052 916 ··· 1064 912 1065 913 iter->bvec += seg_skip; 1066 914 iter->nr_segs -= seg_skip; 915 + iter->count -= bvec->bv_len + offset; 1067 916 iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); 1068 917 } 1069 918 } 1070 919 1071 920 return 0; 921 + } 922 + 923 + inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, 924 + unsigned issue_flags) 925 + { 926 + struct io_ring_ctx *ctx = req->ctx; 927 + struct io_rsrc_node *node; 928 + 929 + if (req->flags & REQ_F_BUF_NODE) 930 + return req->buf_node; 931 + 932 + io_ring_submit_lock(ctx, issue_flags); 933 + node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); 934 + if (node) 935 + io_req_assign_buf_node(req, node); 936 + io_ring_submit_unlock(ctx, issue_flags); 937 + return node; 938 + } 939 + 940 + int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, 941 + u64 buf_addr, size_t len, int ddir, 942 + unsigned issue_flags) 943 + { 944 + struct io_rsrc_node *node; 945 + 946 + node = io_find_buf_node(req, issue_flags); 947 + if (!node) 948 + return -EFAULT; 949 + return io_import_fixed(ddir, iter, node->buf, buf_addr, len); 1072 950 } 1073 951 1074 952 /* Lock two rings at once. The rings must be different! */ ··· 1184 1002 if (!src_node) { 1185 1003 dst_node = NULL; 1186 1004 } else { 1187 - dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); 1005 + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 1188 1006 if (!dst_node) { 1189 1007 ret = -ENOMEM; 1190 1008 goto out_free;
+19 -5
io_uring/rsrc.h
··· 2 2 #ifndef IOU_RSRC_H 3 3 #define IOU_RSRC_H 4 4 5 + #include <linux/io_uring_types.h> 5 6 #include <linux/lockdep.h> 6 7 7 8 enum { ··· 21 20 }; 22 21 }; 23 22 23 + enum { 24 + IO_IMU_DEST = 1 << ITER_DEST, 25 + IO_IMU_SOURCE = 1 << ITER_SOURCE, 26 + }; 27 + 24 28 struct io_mapped_ubuf { 25 29 u64 ubuf; 26 30 unsigned int len; ··· 33 27 unsigned int folio_shift; 34 28 refcount_t refs; 35 29 unsigned long acct_pages; 30 + void (*release)(void *); 31 + void *priv; 32 + bool is_kbuf; 33 + u8 dir; 36 34 struct bio_vec bvec[] __counted_by(nr_bvecs); 37 35 }; 38 36 ··· 49 39 unsigned int nr_folios; 50 40 }; 51 41 52 - struct io_rsrc_node *io_rsrc_node_alloc(int type); 42 + bool io_rsrc_cache_init(struct io_ring_ctx *ctx); 43 + void io_rsrc_cache_free(struct io_ring_ctx *ctx); 44 + struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); 53 45 void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); 54 46 void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); 55 47 int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); 56 48 57 - int io_import_fixed(int ddir, struct iov_iter *iter, 58 - struct io_mapped_ubuf *imu, 59 - u64 buf_addr, size_t len); 49 + struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, 50 + unsigned issue_flags); 51 + int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, 52 + u64 buf_addr, size_t len, int ddir, 53 + unsigned issue_flags); 60 54 61 55 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); 62 56 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); ··· 91 77 static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) 92 78 { 93 79 lockdep_assert_held(&ctx->uring_lock); 94 - if (node && !--node->refs) 80 + if (!--node->refs) 95 81 io_free_rsrc_node(ctx, node); 96 82 } 97 83
+108 -92
io_uring/rw.c
··· 49 49 return false; 50 50 } 51 51 52 - #ifdef CONFIG_COMPAT 53 52 static int io_iov_compat_buffer_select_prep(struct io_rw *rw) 54 53 { 55 - struct compat_iovec __user *uiov; 56 - compat_ssize_t clen; 54 + struct compat_iovec __user *uiov = u64_to_user_ptr(rw->addr); 55 + struct compat_iovec iov; 57 56 58 - uiov = u64_to_user_ptr(rw->addr); 59 - if (!access_ok(uiov, sizeof(*uiov))) 57 + if (copy_from_user(&iov, uiov, sizeof(iov))) 60 58 return -EFAULT; 61 - if (__get_user(clen, &uiov->iov_len)) 62 - return -EFAULT; 63 - if (clen < 0) 64 - return -EINVAL; 65 - 66 - rw->len = clen; 59 + rw->len = iov.iov_len; 67 60 return 0; 68 61 } 69 - #endif 70 62 71 63 static int io_iov_buffer_select_prep(struct io_kiocb *req) 72 64 { ··· 69 77 if (rw->len != 1) 70 78 return -EINVAL; 71 79 72 - #ifdef CONFIG_COMPAT 73 - if (req->ctx->compat) 80 + if (io_is_compat(req->ctx)) 74 81 return io_iov_compat_buffer_select_prep(rw); 75 - #endif 76 82 77 83 uiov = u64_to_user_ptr(rw->addr); 78 84 if (copy_from_user(&iov, uiov, sizeof(*uiov))) ··· 79 89 return 0; 80 90 } 81 91 82 - static int __io_import_iovec(int ddir, struct io_kiocb *req, 83 - struct io_async_rw *io, 84 - unsigned int issue_flags) 92 + static int io_import_vec(int ddir, struct io_kiocb *req, 93 + struct io_async_rw *io, 94 + const struct iovec __user *uvec, 95 + size_t uvec_segs) 85 96 { 86 - const struct io_issue_def *def = &io_issue_defs[req->opcode]; 87 - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 97 + int ret, nr_segs; 88 98 struct iovec *iov; 89 - void __user *buf; 90 - int nr_segs, ret; 91 - size_t sqe_len; 92 - 93 - buf = u64_to_user_ptr(rw->addr); 94 - sqe_len = rw->len; 95 - 96 - if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { 97 - if (io_do_buffer_select(req)) { 98 - buf = io_buffer_select(req, &sqe_len, issue_flags); 99 - if (!buf) 100 - return -ENOBUFS; 101 - rw->addr = (unsigned long) buf; 102 - rw->len = sqe_len; 103 - } 104 - 105 - return import_ubuf(ddir, buf, sqe_len, 
&io->iter); 106 - } 107 99 108 100 if (io->free_iovec) { 109 101 nr_segs = io->free_iov_nr; 110 102 iov = io->free_iovec; 111 103 } else { 112 - iov = &io->fast_iov; 113 104 nr_segs = 1; 105 + iov = &io->fast_iov; 114 106 } 115 - ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, 116 - req->ctx->compat); 107 + 108 + ret = __import_iovec(ddir, uvec, uvec_segs, nr_segs, &iov, &io->iter, 109 + io_is_compat(req->ctx)); 117 110 if (unlikely(ret < 0)) 118 111 return ret; 119 112 if (iov) { ··· 108 135 return 0; 109 136 } 110 137 111 - static inline int io_import_iovec(int rw, struct io_kiocb *req, 112 - struct io_async_rw *io, 113 - unsigned int issue_flags) 138 + static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, 139 + struct io_async_rw *io, 140 + unsigned int issue_flags) 141 + { 142 + const struct io_issue_def *def = &io_issue_defs[req->opcode]; 143 + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 144 + void __user *buf = u64_to_user_ptr(rw->addr); 145 + size_t sqe_len = rw->len; 146 + 147 + if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT)) 148 + return io_import_vec(ddir, req, io, buf, sqe_len); 149 + 150 + if (io_do_buffer_select(req)) { 151 + buf = io_buffer_select(req, &sqe_len, issue_flags); 152 + if (!buf) 153 + return -ENOBUFS; 154 + rw->addr = (unsigned long) buf; 155 + rw->len = sqe_len; 156 + } 157 + return import_ubuf(ddir, buf, sqe_len, &io->iter); 158 + } 159 + 160 + static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, 161 + struct io_async_rw *io, 162 + unsigned int issue_flags) 114 163 { 115 164 int ret; 116 165 117 - ret = __io_import_iovec(rw, req, io, issue_flags); 166 + ret = __io_import_rw_buffer(rw, req, io, issue_flags); 118 167 if (unlikely(ret < 0)) 119 168 return ret; 120 169 ··· 207 212 return 0; 208 213 } 209 214 210 - static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) 211 - { 212 - struct io_async_rw *rw; 213 - 214 - if (io_rw_alloc_async(req)) 215 - 
return -ENOMEM; 216 - 217 - if (!do_import || io_do_buffer_select(req)) 218 - return 0; 219 - 220 - rw = req->async_data; 221 - return io_import_iovec(ddir, req, rw, 0); 222 - } 223 - 224 215 static inline void io_meta_save_state(struct io_async_rw *io) 225 216 { 226 217 io->meta_state.seed = io->meta.seed; ··· 248 267 return ret; 249 268 } 250 269 251 - static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, 252 - int ddir, bool do_import) 270 + static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, 271 + int ddir) 253 272 { 254 273 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 255 274 unsigned ioprio; 256 275 u64 attr_type_mask; 257 276 int ret; 277 + 278 + if (io_rw_alloc_async(req)) 279 + return -ENOMEM; 258 280 259 281 rw->kiocb.ki_pos = READ_ONCE(sqe->off); 260 282 /* used for fixed read/write too - just read unconditionally */ ··· 284 300 rw->addr = READ_ONCE(sqe->addr); 285 301 rw->len = READ_ONCE(sqe->len); 286 302 rw->flags = READ_ONCE(sqe->rw_flags); 287 - ret = io_prep_rw_setup(req, ddir, do_import); 288 - 289 - if (unlikely(ret)) 290 - return ret; 291 303 292 304 attr_type_mask = READ_ONCE(sqe->attr_type_mask); 293 305 if (attr_type_mask) { ··· 294 314 return -EINVAL; 295 315 296 316 attr_ptr = READ_ONCE(sqe->attr_ptr); 297 - ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); 317 + return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); 298 318 } 299 - return ret; 319 + return 0; 320 + } 321 + 322 + static int io_rw_do_import(struct io_kiocb *req, int ddir) 323 + { 324 + if (io_do_buffer_select(req)) 325 + return 0; 326 + 327 + return io_import_rw_buffer(ddir, req, req->async_data, 0); 328 + } 329 + 330 + static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, 331 + int ddir) 332 + { 333 + int ret; 334 + 335 + ret = __io_prep_rw(req, sqe, ddir); 336 + if (unlikely(ret)) 337 + return ret; 338 + 339 + return io_rw_do_import(req, ddir); 300 340 } 301 341 302 342 int 
io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) 303 343 { 304 - return io_prep_rw(req, sqe, ITER_DEST, true); 344 + return io_prep_rw(req, sqe, ITER_DEST); 305 345 } 306 346 307 347 int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) 308 348 { 309 - return io_prep_rw(req, sqe, ITER_SOURCE, true); 349 + return io_prep_rw(req, sqe, ITER_SOURCE); 310 350 } 311 351 312 352 static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, 313 353 int ddir) 314 354 { 315 - const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); 316 355 int ret; 317 356 318 - ret = io_prep_rw(req, sqe, ddir, do_import); 357 + ret = io_prep_rw(req, sqe, ddir); 319 358 if (unlikely(ret)) 320 359 return ret; 321 - if (do_import) 360 + if (!(req->flags & REQ_F_BUFFER_SELECT)) 322 361 return 0; 323 362 324 363 /* ··· 357 358 return io_prep_rwv(req, sqe, ITER_SOURCE); 358 359 } 359 360 360 - static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, 361 + static int io_init_rw_fixed(struct io_kiocb *req, unsigned int issue_flags, 361 362 int ddir) 362 363 { 363 364 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 364 - struct io_ring_ctx *ctx = req->ctx; 365 - struct io_rsrc_node *node; 366 - struct io_async_rw *io; 365 + struct io_async_rw *io = req->async_data; 367 366 int ret; 368 367 369 - ret = io_prep_rw(req, sqe, ddir, false); 370 - if (unlikely(ret)) 371 - return ret; 368 + if (io->bytes_done) 369 + return 0; 372 370 373 - node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); 374 - if (!node) 375 - return -EFAULT; 376 - io_req_assign_buf_node(req, node); 377 - 378 - io = req->async_data; 379 - ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len); 371 + ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir, 372 + issue_flags); 380 373 iov_iter_save_state(&io->iter, &io->iter_state); 381 374 return ret; 382 375 } 383 376 384 377 int io_prep_read_fixed(struct io_kiocb *req, 
const struct io_uring_sqe *sqe) 385 378 { 386 - return io_prep_rw_fixed(req, sqe, ITER_DEST); 379 + return __io_prep_rw(req, sqe, ITER_DEST); 387 380 } 388 381 389 382 int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) 390 383 { 391 - return io_prep_rw_fixed(req, sqe, ITER_SOURCE); 384 + return __io_prep_rw(req, sqe, ITER_SOURCE); 392 385 } 393 386 394 387 /* ··· 396 405 if (!(req->flags & REQ_F_BUFFER_SELECT)) 397 406 return -EINVAL; 398 407 399 - ret = io_prep_rw(req, sqe, ITER_DEST, false); 408 + ret = __io_prep_rw(req, sqe, ITER_DEST); 400 409 if (unlikely(ret)) 401 410 return ret; 402 411 ··· 510 519 return res; 511 520 } 512 521 513 - void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) 522 + void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) 514 523 { 515 524 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 516 525 struct kiocb *kiocb = &rw->kiocb; ··· 527 536 req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); 528 537 529 538 io_req_rw_cleanup(req, 0); 530 - io_req_task_complete(req, ts); 539 + io_req_task_complete(req, tw); 531 540 } 532 541 533 542 static void io_complete_rw(struct kiocb *kiocb, long res) ··· 628 637 */ 629 638 static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) 630 639 { 640 + struct io_kiocb *req = cmd_to_io_kiocb(rw); 631 641 struct kiocb *kiocb = &rw->kiocb; 632 642 struct file *file = kiocb->ki_filp; 633 643 ssize_t ret = 0; ··· 644 652 if ((kiocb->ki_flags & IOCB_NOWAIT) && 645 653 !(kiocb->ki_filp->f_flags & O_NONBLOCK)) 646 654 return -EAGAIN; 655 + if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf) 656 + return -EFAULT; 647 657 648 658 ppos = io_kiocb_ppos(kiocb); 649 659 ··· 857 863 loff_t *ppos; 858 864 859 865 if (io_do_buffer_select(req)) { 860 - ret = io_import_iovec(ITER_DEST, req, io, issue_flags); 866 + ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); 861 867 if (unlikely(ret < 0)) 862 868 return ret; 863 
869 } ··· 1146 1152 io_req_end_write(req); 1147 1153 return -EAGAIN; 1148 1154 } 1155 + } 1156 + 1157 + int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags) 1158 + { 1159 + int ret; 1160 + 1161 + ret = io_init_rw_fixed(req, issue_flags, ITER_DEST); 1162 + if (unlikely(ret)) 1163 + return ret; 1164 + 1165 + return io_read(req, issue_flags); 1166 + } 1167 + 1168 + int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags) 1169 + { 1170 + int ret; 1171 + 1172 + ret = io_init_rw_fixed(req, issue_flags, ITER_SOURCE); 1173 + if (unlikely(ret)) 1174 + return ret; 1175 + 1176 + return io_write(req, issue_flags); 1149 1177 } 1150 1178 1151 1179 void io_rw_fail(struct io_kiocb *req)
+4 -1
io_uring/rw.h
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 + #include <linux/io_uring_types.h> 3 4 #include <linux/pagemap.h> 4 5 5 6 struct io_meta_state { ··· 38 37 int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); 39 38 int io_read(struct io_kiocb *req, unsigned int issue_flags); 40 39 int io_write(struct io_kiocb *req, unsigned int issue_flags); 40 + int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags); 41 + int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags); 41 42 void io_readv_writev_cleanup(struct io_kiocb *req); 42 43 void io_rw_fail(struct io_kiocb *req); 43 - void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); 44 + void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw); 44 45 int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 45 46 int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); 46 47 void io_rw_cache_free(const void *entry);
+2 -1
io_uring/splice.c
··· 51 51 { 52 52 struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice); 53 53 54 - io_put_rsrc_node(req->ctx, sp->rsrc_node); 54 + if (sp->rsrc_node) 55 + io_put_rsrc_node(req->ctx, sp->rsrc_node); 55 56 } 56 57 57 58 static struct file *io_splice_get_file(struct io_kiocb *req,
+8 -8
io_uring/timeout.c
··· 65 65 66 66 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); 67 67 68 - static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) 68 + static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) 69 69 { 70 70 struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 71 71 struct io_timeout_data *data = req->async_data; ··· 82 82 } 83 83 } 84 84 85 - io_req_task_complete(req, ts); 85 + io_req_task_complete(req, tw); 86 86 } 87 87 88 88 static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) ··· 154 154 io_flush_killed_timeouts(&list, 0); 155 155 } 156 156 157 - static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) 157 + static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) 158 158 { 159 - io_tw_lock(link->ctx, ts); 159 + io_tw_lock(link->ctx, tw); 160 160 while (link) { 161 161 struct io_kiocb *nxt = link->link; 162 162 long res = -ECANCELED; ··· 165 165 res = link->cqe.res; 166 166 link->link = NULL; 167 167 io_req_set_res(link, res, 0); 168 - io_req_task_complete(link, ts); 168 + io_req_task_complete(link, tw); 169 169 link = nxt; 170 170 } 171 171 } ··· 312 312 return 0; 313 313 } 314 314 315 - static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) 315 + static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) 316 316 { 317 317 struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 318 318 struct io_kiocb *prev = timeout->prev; ··· 330 330 ret = -ECANCELED; 331 331 } 332 332 io_req_set_res(req, ret ?: -ETIME, 0); 333 - io_req_task_complete(req, ts); 333 + io_req_task_complete(req, tw); 334 334 io_put_req(prev); 335 335 } else { 336 336 io_req_set_res(req, -ETIME, 0); 337 - io_req_task_complete(req, ts); 337 + io_req_task_complete(req, tw); 338 338 } 339 339 } 340 340
+8 -23
io_uring/uring_cmd.c
··· 102 102 } 103 103 EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); 104 104 105 - static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) 105 + static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) 106 106 { 107 107 struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 108 108 unsigned int flags = IO_URING_F_COMPLETE_DEFER; ··· 199 199 if (ioucmd->flags & ~IORING_URING_CMD_MASK) 200 200 return -EINVAL; 201 201 202 - if (ioucmd->flags & IORING_URING_CMD_FIXED) { 203 - struct io_ring_ctx *ctx = req->ctx; 204 - struct io_rsrc_node *node; 205 - u16 index = READ_ONCE(sqe->buf_index); 202 + if (ioucmd->flags & IORING_URING_CMD_FIXED) 203 + req->buf_index = READ_ONCE(sqe->buf_index); 206 204 207 - node = io_rsrc_node_lookup(&ctx->buf_table, index); 208 - if (unlikely(!node)) 209 - return -EFAULT; 210 - /* 211 - * Pi node upfront, prior to io_uring_cmd_import_fixed() 212 - * being called. This prevents destruction of the mapped buffer 213 - * we'll need at actual import time. 
214 - */ 215 - io_req_assign_buf_node(req, node); 216 - } 217 205 ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); 218 206 219 207 return io_uring_cmd_prep_setup(req, sqe); ··· 225 237 issue_flags |= IO_URING_F_SQE128; 226 238 if (ctx->flags & IORING_SETUP_CQE32) 227 239 issue_flags |= IO_URING_F_CQE32; 228 - if (ctx->compat) 240 + if (io_is_compat(ctx)) 229 241 issue_flags |= IO_URING_F_COMPAT; 230 242 if (ctx->flags & IORING_SETUP_IOPOLL) { 231 243 if (!file->f_op->uring_cmd_iopoll) ··· 245 257 } 246 258 247 259 int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 248 - struct iov_iter *iter, void *ioucmd) 260 + struct iov_iter *iter, 261 + struct io_uring_cmd *ioucmd, 262 + unsigned int issue_flags) 249 263 { 250 264 struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); 251 - struct io_rsrc_node *node = req->buf_node; 252 265 253 - /* Must have had rsrc_node assigned at prep time */ 254 - if (node) 255 - return io_import_fixed(rw, iter, node->buf, ubuf, len); 256 - 257 - return -EFAULT; 266 + return io_import_reg_buf(req, iter, ubuf, len, rw, issue_flags); 258 267 } 259 268 EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); 260 269
+8 -48
io_uring/waitid.c
··· 16 16 #include "waitid.h" 17 17 #include "../kernel/exit.h" 18 18 19 - static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); 19 + static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw); 20 20 21 21 #define IO_WAITID_CANCEL_FLAG BIT(31) 22 22 #define IO_WAITID_REF_MASK GENMASK(30, 0) ··· 42 42 req->flags &= ~REQ_F_ASYNC_DATA; 43 43 } 44 44 45 - #ifdef CONFIG_COMPAT 46 45 static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) 47 46 { 48 47 struct compat_siginfo __user *infop; ··· 66 67 ret = false; 67 68 goto done; 68 69 } 69 - #endif 70 70 71 71 static bool io_waitid_copy_si(struct io_kiocb *req, int signo) 72 72 { ··· 75 77 if (!iw->infop) 76 78 return true; 77 79 78 - #ifdef CONFIG_COMPAT 79 - if (req->ctx->compat) 80 + if (io_is_compat(req->ctx)) 80 81 return io_waitid_compat_copy_si(iw, signo); 81 - #endif 82 82 83 83 if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) 84 84 return false; ··· 128 132 io_req_set_res(req, ret, 0); 129 133 } 130 134 131 - static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) 135 + static bool __io_waitid_cancel(struct io_kiocb *req) 132 136 { 133 137 struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 134 138 struct io_waitid_async *iwa = req->async_data; ··· 154 158 int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 155 159 unsigned int issue_flags) 156 160 { 157 - struct hlist_node *tmp; 158 - struct io_kiocb *req; 159 - int nr = 0; 160 - 161 - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) 162 - return -ENOENT; 163 - 164 - io_ring_submit_lock(ctx, issue_flags); 165 - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { 166 - if (req->cqe.user_data != cd->data && 167 - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) 168 - continue; 169 - if (__io_waitid_cancel(ctx, req)) 170 - nr++; 171 - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) 172 - break; 173 - } 174 - io_ring_submit_unlock(ctx, 
issue_flags); 175 - 176 - if (nr) 177 - return nr; 178 - 179 - return -ENOENT; 161 + return io_cancel_remove(ctx, cd, issue_flags, &ctx->waitid_list, __io_waitid_cancel); 180 162 } 181 163 182 164 bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 183 165 bool cancel_all) 184 166 { 185 - struct hlist_node *tmp; 186 - struct io_kiocb *req; 187 - bool found = false; 188 - 189 - lockdep_assert_held(&ctx->uring_lock); 190 - 191 - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { 192 - if (!io_match_task_safe(req, tctx, cancel_all)) 193 - continue; 194 - hlist_del_init(&req->hash_node); 195 - __io_waitid_cancel(ctx, req); 196 - found = true; 197 - } 198 - 199 - return found; 167 + return io_cancel_remove_all(ctx, tctx, &ctx->waitid_list, cancel_all, __io_waitid_cancel); 200 168 } 201 169 202 170 static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) ··· 181 221 return true; 182 222 } 183 223 184 - static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) 224 + static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) 185 225 { 186 226 struct io_waitid_async *iwa = req->async_data; 187 227 struct io_ring_ctx *ctx = req->ctx; 188 228 int ret; 189 229 190 - io_tw_lock(ctx, ts); 230 + io_tw_lock(ctx, tw); 191 231 192 232 ret = __do_wait(&iwa->wo); 193 233 ··· 217 257 } 218 258 219 259 io_waitid_complete(req, ret); 220 - io_req_task_complete(req, ts); 260 + io_req_task_complete(req, tw); 221 261 } 222 262 223 263 static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
+1
tools/testing/selftests/Makefile
··· 114 114 TARGETS += tmpfs 115 115 TARGETS += tpm2 116 116 TARGETS += tty 117 + TARGETS += ublk 117 118 TARGETS += uevent 118 119 TARGETS += user_events 119 120 TARGETS += vDSO
+3
tools/testing/selftests/ublk/.gitignore
··· 1 + kublk 2 + /tools 3 + *-verify.state
+27
tools/testing/selftests/ublk/Makefile
# SPDX-License-Identifier: GPL-2.0

# Build flags for the kublk test server; liburing is a hard dependency.
CFLAGS += -O3 -Wl,-no-as-needed -Wall -I $(top_srcdir)
LDLIBS += -lpthread -lm -luring

# Shell test cases, grouped by target type.
TEST_PROGS := test_generic_01.sh

TEST_PROGS += test_null_01.sh
TEST_PROGS += test_null_02.sh
TEST_PROGS += test_loop_01.sh
TEST_PROGS += test_loop_02.sh
TEST_PROGS += test_loop_03.sh
TEST_PROGS += test_loop_04.sh
TEST_PROGS += test_stripe_01.sh
TEST_PROGS += test_stripe_02.sh

TEST_PROGS += test_stress_01.sh
TEST_PROGS += test_stress_02.sh

# Helper binary used by the scripts above (built, but not run as a test).
TEST_GEN_PROGS_EXTENDED = kublk

include ../lib.mk

# kublk is linked from all target implementations.
$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c common.c stripe.c

# 'check' produces no file; mark it phony so a stray file named "check"
# can never make the lint step silently become a no-op.
.PHONY: check
check:
	shellcheck -x -f gcc *.sh
+55
tools/testing/selftests/ublk/common.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "kublk.h" 4 + 5 + void backing_file_tgt_deinit(struct ublk_dev *dev) 6 + { 7 + int i; 8 + 9 + for (i = 1; i < dev->nr_fds; i++) { 10 + fsync(dev->fds[i]); 11 + close(dev->fds[i]); 12 + } 13 + } 14 + 15 + int backing_file_tgt_init(struct ublk_dev *dev) 16 + { 17 + int fd, i; 18 + 19 + assert(dev->nr_fds == 1); 20 + 21 + for (i = 0; i < dev->tgt.nr_backing_files; i++) { 22 + char *file = dev->tgt.backing_file[i]; 23 + unsigned long bytes; 24 + struct stat st; 25 + 26 + ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); 27 + 28 + fd = open(file, O_RDWR | O_DIRECT); 29 + if (fd < 0) { 30 + ublk_err("%s: backing file %s can't be opened: %s\n", 31 + __func__, file, strerror(errno)); 32 + return -EBADF; 33 + } 34 + 35 + if (fstat(fd, &st) < 0) { 36 + close(fd); 37 + return -EBADF; 38 + } 39 + 40 + if (S_ISREG(st.st_mode)) 41 + bytes = st.st_size; 42 + else if (S_ISBLK(st.st_mode)) { 43 + if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) 44 + return -1; 45 + } else { 46 + return -EINVAL; 47 + } 48 + 49 + dev->tgt.backing_file_size[i] = bytes; 50 + dev->fds[dev->nr_fds] = fd; 51 + dev->nr_fds += 1; 52 + } 53 + 54 + return 0; 55 + }
+1
tools/testing/selftests/ublk/config
··· 1 + CONFIG_BLK_DEV_UBLK=m
+169
tools/testing/selftests/ublk/file_backed.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "kublk.h" 4 + 5 + static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int zc) 6 + { 7 + unsigned ublk_op = ublksrv_get_op(iod); 8 + 9 + if (ublk_op == UBLK_IO_OP_READ) 10 + return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; 11 + else if (ublk_op == UBLK_IO_OP_WRITE) 12 + return zc ? IORING_OP_WRITE_FIXED : IORING_OP_WRITE; 13 + assert(0); 14 + } 15 + 16 + static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) 17 + { 18 + unsigned ublk_op = ublksrv_get_op(iod); 19 + struct io_uring_sqe *sqe[1]; 20 + 21 + ublk_queue_alloc_sqes(q, sqe, 1); 22 + io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC); 23 + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); 24 + /* bit63 marks us as tgt io */ 25 + sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1); 26 + return 1; 27 + } 28 + 29 + static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) 30 + { 31 + unsigned ublk_op = ublksrv_get_op(iod); 32 + int zc = ublk_queue_use_zc(q); 33 + enum io_uring_op op = ublk_to_uring_op(iod, zc); 34 + struct io_uring_sqe *sqe[3]; 35 + 36 + if (!zc) { 37 + ublk_queue_alloc_sqes(q, sqe, 1); 38 + if (!sqe[0]) 39 + return -ENOMEM; 40 + 41 + io_uring_prep_rw(op, sqe[0], 1 /*fds[1]*/, 42 + (void *)iod->addr, 43 + iod->nr_sectors << 9, 44 + iod->start_sector << 9); 45 + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); 46 + /* bit63 marks us as tgt io */ 47 + sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1); 48 + return 1; 49 + } 50 + 51 + ublk_queue_alloc_sqes(q, sqe, 3); 52 + 53 + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); 54 + sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; 55 + sqe[0]->user_data = build_user_data(tag, 56 + ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1); 57 + 58 + io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0, 59 + iod->nr_sectors << 9, 60 + iod->start_sector << 9); 61 + 
sqe[1]->buf_index = tag; 62 + sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; 63 + sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1); 64 + 65 + io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag); 66 + sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1); 67 + 68 + return 2; 69 + } 70 + 71 + static int loop_queue_tgt_io(struct ublk_queue *q, int tag) 72 + { 73 + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); 74 + unsigned ublk_op = ublksrv_get_op(iod); 75 + int ret; 76 + 77 + switch (ublk_op) { 78 + case UBLK_IO_OP_FLUSH: 79 + ret = loop_queue_flush_io(q, iod, tag); 80 + break; 81 + case UBLK_IO_OP_WRITE_ZEROES: 82 + case UBLK_IO_OP_DISCARD: 83 + ret = -ENOTSUP; 84 + break; 85 + case UBLK_IO_OP_READ: 86 + case UBLK_IO_OP_WRITE: 87 + ret = loop_queue_tgt_rw_io(q, iod, tag); 88 + break; 89 + default: 90 + ret = -EINVAL; 91 + break; 92 + } 93 + 94 + ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u\n", __func__, tag, 95 + iod->op_flags, iod->start_sector, iod->nr_sectors << 9); 96 + return ret; 97 + } 98 + 99 + static int ublk_loop_queue_io(struct ublk_queue *q, int tag) 100 + { 101 + int queued = loop_queue_tgt_io(q, tag); 102 + 103 + ublk_queued_tgt_io(q, tag, queued); 104 + return 0; 105 + } 106 + 107 + static void ublk_loop_io_done(struct ublk_queue *q, int tag, 108 + const struct io_uring_cqe *cqe) 109 + { 110 + unsigned op = user_data_to_op(cqe->user_data); 111 + struct ublk_io *io = ublk_get_io(q, tag); 112 + 113 + if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { 114 + if (!io->result) 115 + io->result = cqe->res; 116 + if (cqe->res < 0) 117 + ublk_err("%s: io failed op %x user_data %lx\n", 118 + __func__, op, cqe->user_data); 119 + } 120 + 121 + /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ 122 + if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) 123 + io->tgt_ios += 1; 124 + 125 + if (ublk_completed_tgt_io(q, tag)) 126 + ublk_complete_io(q, tag, io->result); 127 + } 128 
+ 129 + static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) 130 + { 131 + unsigned long long bytes; 132 + int ret; 133 + struct ublk_params p = { 134 + .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN, 135 + .basic = { 136 + .attrs = UBLK_ATTR_VOLATILE_CACHE, 137 + .logical_bs_shift = 9, 138 + .physical_bs_shift = 12, 139 + .io_opt_shift = 12, 140 + .io_min_shift = 9, 141 + .max_sectors = dev->dev_info.max_io_buf_bytes >> 9, 142 + }, 143 + .dma = { 144 + .alignment = 511, 145 + }, 146 + }; 147 + 148 + ret = backing_file_tgt_init(dev); 149 + if (ret) 150 + return ret; 151 + 152 + if (dev->tgt.nr_backing_files != 1) 153 + return -EINVAL; 154 + 155 + bytes = dev->tgt.backing_file_size[0]; 156 + dev->tgt.dev_size = bytes; 157 + p.basic.dev_sectors = bytes >> 9; 158 + dev->tgt.params = p; 159 + 160 + return 0; 161 + } 162 + 163 + const struct ublk_tgt_ops loop_tgt_ops = { 164 + .name = "loop", 165 + .init_tgt = ublk_loop_tgt_init, 166 + .deinit_tgt = backing_file_tgt_deinit, 167 + .queue_io = ublk_loop_queue_io, 168 + .tgt_io_done = ublk_loop_io_done, 169 + };
+1138
tools/testing/selftests/ublk/kublk.c
··· 1 + /* SPDX-License-Identifier: MIT */ 2 + /* 3 + * Description: uring_cmd based ublk 4 + */ 5 + 6 + #include "kublk.h" 7 + 8 + unsigned int ublk_dbg_mask = UBLK_LOG; 9 + static const struct ublk_tgt_ops *tgt_ops_list[] = { 10 + &null_tgt_ops, 11 + &loop_tgt_ops, 12 + &stripe_tgt_ops, 13 + }; 14 + 15 + static const struct ublk_tgt_ops *ublk_find_tgt(const char *name) 16 + { 17 + const struct ublk_tgt_ops *ops; 18 + int i; 19 + 20 + if (name == NULL) 21 + return NULL; 22 + 23 + for (i = 0; sizeof(tgt_ops_list) / sizeof(ops); i++) 24 + if (strcmp(tgt_ops_list[i]->name, name) == 0) 25 + return tgt_ops_list[i]; 26 + return NULL; 27 + } 28 + 29 + static inline int ublk_setup_ring(struct io_uring *r, int depth, 30 + int cq_depth, unsigned flags) 31 + { 32 + struct io_uring_params p; 33 + 34 + memset(&p, 0, sizeof(p)); 35 + p.flags = flags | IORING_SETUP_CQSIZE; 36 + p.cq_entries = cq_depth; 37 + 38 + return io_uring_queue_init_params(depth, r, &p); 39 + } 40 + 41 + static void ublk_ctrl_init_cmd(struct ublk_dev *dev, 42 + struct io_uring_sqe *sqe, 43 + struct ublk_ctrl_cmd_data *data) 44 + { 45 + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; 46 + struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe); 47 + 48 + sqe->fd = dev->ctrl_fd; 49 + sqe->opcode = IORING_OP_URING_CMD; 50 + sqe->ioprio = 0; 51 + 52 + if (data->flags & CTRL_CMD_HAS_BUF) { 53 + cmd->addr = data->addr; 54 + cmd->len = data->len; 55 + } 56 + 57 + if (data->flags & CTRL_CMD_HAS_DATA) 58 + cmd->data[0] = data->data[0]; 59 + 60 + cmd->dev_id = info->dev_id; 61 + cmd->queue_id = -1; 62 + 63 + ublk_set_sqe_cmd_op(sqe, data->cmd_op); 64 + 65 + io_uring_sqe_set_data(sqe, cmd); 66 + } 67 + 68 + static int __ublk_ctrl_cmd(struct ublk_dev *dev, 69 + struct ublk_ctrl_cmd_data *data) 70 + { 71 + struct io_uring_sqe *sqe; 72 + struct io_uring_cqe *cqe; 73 + int ret = -EINVAL; 74 + 75 + sqe = io_uring_get_sqe(&dev->ring); 76 + if (!sqe) { 77 + ublk_err("%s: can't get sqe ret %d\n", 
__func__, ret); 78 + return ret; 79 + } 80 + 81 + ublk_ctrl_init_cmd(dev, sqe, data); 82 + 83 + ret = io_uring_submit(&dev->ring); 84 + if (ret < 0) { 85 + ublk_err("uring submit ret %d\n", ret); 86 + return ret; 87 + } 88 + 89 + ret = io_uring_wait_cqe(&dev->ring, &cqe); 90 + if (ret < 0) { 91 + ublk_err("wait cqe: %s\n", strerror(-ret)); 92 + return ret; 93 + } 94 + io_uring_cqe_seen(&dev->ring, cqe); 95 + 96 + return cqe->res; 97 + } 98 + 99 + static int ublk_ctrl_stop_dev(struct ublk_dev *dev) 100 + { 101 + struct ublk_ctrl_cmd_data data = { 102 + .cmd_op = UBLK_CMD_STOP_DEV, 103 + }; 104 + 105 + return __ublk_ctrl_cmd(dev, &data); 106 + } 107 + 108 + static int ublk_ctrl_start_dev(struct ublk_dev *dev, 109 + int daemon_pid) 110 + { 111 + struct ublk_ctrl_cmd_data data = { 112 + .cmd_op = UBLK_U_CMD_START_DEV, 113 + .flags = CTRL_CMD_HAS_DATA, 114 + }; 115 + 116 + dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid; 117 + 118 + return __ublk_ctrl_cmd(dev, &data); 119 + } 120 + 121 + static int ublk_ctrl_add_dev(struct ublk_dev *dev) 122 + { 123 + struct ublk_ctrl_cmd_data data = { 124 + .cmd_op = UBLK_U_CMD_ADD_DEV, 125 + .flags = CTRL_CMD_HAS_BUF, 126 + .addr = (__u64) (uintptr_t) &dev->dev_info, 127 + .len = sizeof(struct ublksrv_ctrl_dev_info), 128 + }; 129 + 130 + return __ublk_ctrl_cmd(dev, &data); 131 + } 132 + 133 + static int ublk_ctrl_del_dev(struct ublk_dev *dev) 134 + { 135 + struct ublk_ctrl_cmd_data data = { 136 + .cmd_op = UBLK_U_CMD_DEL_DEV, 137 + .flags = 0, 138 + }; 139 + 140 + return __ublk_ctrl_cmd(dev, &data); 141 + } 142 + 143 + static int ublk_ctrl_get_info(struct ublk_dev *dev) 144 + { 145 + struct ublk_ctrl_cmd_data data = { 146 + .cmd_op = UBLK_U_CMD_GET_DEV_INFO, 147 + .flags = CTRL_CMD_HAS_BUF, 148 + .addr = (__u64) (uintptr_t) &dev->dev_info, 149 + .len = sizeof(struct ublksrv_ctrl_dev_info), 150 + }; 151 + 152 + return __ublk_ctrl_cmd(dev, &data); 153 + } 154 + 155 + static int ublk_ctrl_set_params(struct ublk_dev *dev, 156 + 
struct ublk_params *params) 157 + { 158 + struct ublk_ctrl_cmd_data data = { 159 + .cmd_op = UBLK_U_CMD_SET_PARAMS, 160 + .flags = CTRL_CMD_HAS_BUF, 161 + .addr = (__u64) (uintptr_t) params, 162 + .len = sizeof(*params), 163 + }; 164 + params->len = sizeof(*params); 165 + return __ublk_ctrl_cmd(dev, &data); 166 + } 167 + 168 + static int ublk_ctrl_get_params(struct ublk_dev *dev, 169 + struct ublk_params *params) 170 + { 171 + struct ublk_ctrl_cmd_data data = { 172 + .cmd_op = UBLK_CMD_GET_PARAMS, 173 + .flags = CTRL_CMD_HAS_BUF, 174 + .addr = (__u64)params, 175 + .len = sizeof(*params), 176 + }; 177 + 178 + params->len = sizeof(*params); 179 + 180 + return __ublk_ctrl_cmd(dev, &data); 181 + } 182 + 183 + static int ublk_ctrl_get_features(struct ublk_dev *dev, 184 + __u64 *features) 185 + { 186 + struct ublk_ctrl_cmd_data data = { 187 + .cmd_op = UBLK_U_CMD_GET_FEATURES, 188 + .flags = CTRL_CMD_HAS_BUF, 189 + .addr = (__u64) (uintptr_t) features, 190 + .len = sizeof(*features), 191 + }; 192 + 193 + return __ublk_ctrl_cmd(dev, &data); 194 + } 195 + 196 + static const char *ublk_dev_state_desc(struct ublk_dev *dev) 197 + { 198 + switch (dev->dev_info.state) { 199 + case UBLK_S_DEV_DEAD: 200 + return "DEAD"; 201 + case UBLK_S_DEV_LIVE: 202 + return "LIVE"; 203 + case UBLK_S_DEV_QUIESCED: 204 + return "QUIESCED"; 205 + default: 206 + return "UNKNOWN"; 207 + }; 208 + } 209 + 210 + static void ublk_ctrl_dump(struct ublk_dev *dev) 211 + { 212 + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; 213 + struct ublk_params p; 214 + int ret; 215 + 216 + ret = ublk_ctrl_get_params(dev, &p); 217 + if (ret < 0) { 218 + ublk_err("failed to get params %m\n"); 219 + return; 220 + } 221 + 222 + ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", 223 + info->dev_id, info->nr_hw_queues, info->queue_depth, 224 + 1 << p.basic.logical_bs_shift, p.basic.dev_sectors); 225 + ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n", 226 + 
info->max_io_buf_bytes, info->ublksrv_pid, info->flags, 227 + ublk_dev_state_desc(dev)); 228 + fflush(stdout); 229 + } 230 + 231 + static void ublk_ctrl_deinit(struct ublk_dev *dev) 232 + { 233 + close(dev->ctrl_fd); 234 + free(dev); 235 + } 236 + 237 + static struct ublk_dev *ublk_ctrl_init(void) 238 + { 239 + struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev)); 240 + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; 241 + int ret; 242 + 243 + dev->ctrl_fd = open(CTRL_DEV, O_RDWR); 244 + if (dev->ctrl_fd < 0) { 245 + free(dev); 246 + return NULL; 247 + } 248 + 249 + info->max_io_buf_bytes = UBLK_IO_MAX_BYTES; 250 + 251 + ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH, 252 + UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128); 253 + if (ret < 0) { 254 + ublk_err("queue_init: %s\n", strerror(-ret)); 255 + free(dev); 256 + return NULL; 257 + } 258 + dev->nr_fds = 1; 259 + 260 + return dev; 261 + } 262 + 263 + static int __ublk_queue_cmd_buf_sz(unsigned depth) 264 + { 265 + int size = depth * sizeof(struct ublksrv_io_desc); 266 + unsigned int page_sz = getpagesize(); 267 + 268 + return round_up(size, page_sz); 269 + } 270 + 271 + static int ublk_queue_max_cmd_buf_sz(void) 272 + { 273 + return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH); 274 + } 275 + 276 + static int ublk_queue_cmd_buf_sz(struct ublk_queue *q) 277 + { 278 + return __ublk_queue_cmd_buf_sz(q->q_depth); 279 + } 280 + 281 + static void ublk_queue_deinit(struct ublk_queue *q) 282 + { 283 + int i; 284 + int nr_ios = q->q_depth; 285 + 286 + io_uring_unregister_buffers(&q->ring); 287 + 288 + io_uring_unregister_ring_fd(&q->ring); 289 + 290 + if (q->ring.ring_fd > 0) { 291 + io_uring_unregister_files(&q->ring); 292 + close(q->ring.ring_fd); 293 + q->ring.ring_fd = -1; 294 + } 295 + 296 + if (q->io_cmd_buf) 297 + munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); 298 + 299 + for (i = 0; i < nr_ios; i++) 300 + free(q->ios[i].buf_addr); 301 + } 302 + 303 + static int ublk_queue_init(struct 
ublk_queue *q) 304 + { 305 + struct ublk_dev *dev = q->dev; 306 + int depth = dev->dev_info.queue_depth; 307 + int i, ret = -1; 308 + int cmd_buf_size, io_buf_size; 309 + unsigned long off; 310 + int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; 311 + 312 + q->tgt_ops = dev->tgt.ops; 313 + q->state = 0; 314 + q->q_depth = depth; 315 + q->cmd_inflight = 0; 316 + q->tid = gettid(); 317 + 318 + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { 319 + q->state |= UBLKSRV_NO_BUF; 320 + q->state |= UBLKSRV_ZC; 321 + } 322 + 323 + cmd_buf_size = ublk_queue_cmd_buf_sz(q); 324 + off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); 325 + q->io_cmd_buf = (char *)mmap(0, cmd_buf_size, PROT_READ, 326 + MAP_SHARED | MAP_POPULATE, dev->fds[0], off); 327 + if (q->io_cmd_buf == MAP_FAILED) { 328 + ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n", 329 + q->dev->dev_info.dev_id, q->q_id); 330 + goto fail; 331 + } 332 + 333 + io_buf_size = dev->dev_info.max_io_buf_bytes; 334 + for (i = 0; i < q->q_depth; i++) { 335 + q->ios[i].buf_addr = NULL; 336 + q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE; 337 + 338 + if (q->state & UBLKSRV_NO_BUF) 339 + continue; 340 + 341 + if (posix_memalign((void **)&q->ios[i].buf_addr, 342 + getpagesize(), io_buf_size)) { 343 + ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n", 344 + dev->dev_info.dev_id, q->q_id, i); 345 + goto fail; 346 + } 347 + } 348 + 349 + ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth, 350 + IORING_SETUP_COOP_TASKRUN); 351 + if (ret < 0) { 352 + ublk_err("ublk dev %d queue %d setup io_uring failed %d\n", 353 + q->dev->dev_info.dev_id, q->q_id, ret); 354 + goto fail; 355 + } 356 + 357 + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { 358 + ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth); 359 + if (ret) { 360 + ublk_err("ublk dev %d queue %d register spare buffers failed %d", 361 + dev->dev_info.dev_id, q->q_id, ret); 362 + goto fail; 363 
+ } 364 + } 365 + 366 + io_uring_register_ring_fd(&q->ring); 367 + 368 + ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds); 369 + if (ret) { 370 + ublk_err("ublk dev %d queue %d register files failed %d\n", 371 + q->dev->dev_info.dev_id, q->q_id, ret); 372 + goto fail; 373 + } 374 + 375 + return 0; 376 + fail: 377 + ublk_queue_deinit(q); 378 + ublk_err("ublk dev %d queue %d failed\n", 379 + dev->dev_info.dev_id, q->q_id); 380 + return -ENOMEM; 381 + } 382 + 383 + #define WAIT_USEC 100000 384 + #define MAX_WAIT_USEC (3 * 1000000) 385 + static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev) 386 + { 387 + int dev_id = dev->dev_info.dev_id; 388 + unsigned int wait_usec = 0; 389 + int ret = 0, fd = -1; 390 + char buf[64]; 391 + 392 + snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id); 393 + 394 + while (wait_usec < MAX_WAIT_USEC) { 395 + fd = open(buf, O_RDWR); 396 + if (fd >= 0) 397 + break; 398 + usleep(WAIT_USEC); 399 + wait_usec += WAIT_USEC; 400 + } 401 + if (fd < 0) { 402 + ublk_err("can't open %s %s\n", buf, strerror(errno)); 403 + return -1; 404 + } 405 + 406 + dev->fds[0] = fd; 407 + if (dev->tgt.ops->init_tgt) 408 + ret = dev->tgt.ops->init_tgt(ctx, dev); 409 + if (ret) 410 + close(dev->fds[0]); 411 + return ret; 412 + } 413 + 414 + static void ublk_dev_unprep(struct ublk_dev *dev) 415 + { 416 + if (dev->tgt.ops->deinit_tgt) 417 + dev->tgt.ops->deinit_tgt(dev); 418 + close(dev->fds[0]); 419 + } 420 + 421 + int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag) 422 + { 423 + struct ublksrv_io_cmd *cmd; 424 + struct io_uring_sqe *sqe[1]; 425 + unsigned int cmd_op = 0; 426 + __u64 user_data; 427 + 428 + /* only freed io can be issued */ 429 + if (!(io->flags & UBLKSRV_IO_FREE)) 430 + return 0; 431 + 432 + /* we issue because we need either fetching or committing */ 433 + if (!(io->flags & 434 + (UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP))) 435 + return 0; 436 + 437 + if (io->flags & 
UBLKSRV_NEED_COMMIT_RQ_COMP) 438 + cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ; 439 + else if (io->flags & UBLKSRV_NEED_FETCH_RQ) 440 + cmd_op = UBLK_U_IO_FETCH_REQ; 441 + 442 + if (io_uring_sq_space_left(&q->ring) < 1) 443 + io_uring_submit(&q->ring); 444 + 445 + ublk_queue_alloc_sqes(q, sqe, 1); 446 + if (!sqe[0]) { 447 + ublk_err("%s: run out of sqe %d, tag %d\n", 448 + __func__, q->q_id, tag); 449 + return -1; 450 + } 451 + 452 + cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]); 453 + 454 + if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ) 455 + cmd->result = io->result; 456 + 457 + /* These fields should be written once, never change */ 458 + ublk_set_sqe_cmd_op(sqe[0], cmd_op); 459 + sqe[0]->fd = 0; /* dev->fds[0] */ 460 + sqe[0]->opcode = IORING_OP_URING_CMD; 461 + sqe[0]->flags = IOSQE_FIXED_FILE; 462 + sqe[0]->rw_flags = 0; 463 + cmd->tag = tag; 464 + cmd->q_id = q->q_id; 465 + if (!(q->state & UBLKSRV_NO_BUF)) 466 + cmd->addr = (__u64) (uintptr_t) io->buf_addr; 467 + else 468 + cmd->addr = 0; 469 + 470 + user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0); 471 + io_uring_sqe_set_data64(sqe[0], user_data); 472 + 473 + io->flags = 0; 474 + 475 + q->cmd_inflight += 1; 476 + 477 + ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n", 478 + __func__, q->q_id, tag, cmd_op, 479 + io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING)); 480 + return 1; 481 + } 482 + 483 + static void ublk_submit_fetch_commands(struct ublk_queue *q) 484 + { 485 + int i = 0; 486 + 487 + for (i = 0; i < q->q_depth; i++) 488 + ublk_queue_io_cmd(q, &q->ios[i], i); 489 + } 490 + 491 + static int ublk_queue_is_idle(struct ublk_queue *q) 492 + { 493 + return !io_uring_sq_ready(&q->ring) && !q->io_inflight; 494 + } 495 + 496 + static int ublk_queue_is_done(struct ublk_queue *q) 497 + { 498 + return (q->state & UBLKSRV_QUEUE_STOPPING) && ublk_queue_is_idle(q); 499 + } 500 + 501 + static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q, 502 + struct 
io_uring_cqe *cqe) 503 + { 504 + unsigned tag = user_data_to_tag(cqe->user_data); 505 + 506 + if (cqe->res < 0 && cqe->res != -EAGAIN) 507 + ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n", 508 + __func__, cqe->res, q->q_id, 509 + user_data_to_tag(cqe->user_data), 510 + user_data_to_op(cqe->user_data)); 511 + 512 + if (q->tgt_ops->tgt_io_done) 513 + q->tgt_ops->tgt_io_done(q, tag, cqe); 514 + } 515 + 516 + static void ublk_handle_cqe(struct io_uring *r, 517 + struct io_uring_cqe *cqe, void *data) 518 + { 519 + struct ublk_queue *q = container_of(r, struct ublk_queue, ring); 520 + unsigned tag = user_data_to_tag(cqe->user_data); 521 + unsigned cmd_op = user_data_to_op(cqe->user_data); 522 + int fetch = (cqe->res != UBLK_IO_RES_ABORT) && 523 + !(q->state & UBLKSRV_QUEUE_STOPPING); 524 + struct ublk_io *io; 525 + 526 + if (cqe->res < 0 && cqe->res != -ENODEV) 527 + ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, 528 + cqe->res, cqe->user_data, q->state); 529 + 530 + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", 531 + __func__, cqe->res, q->q_id, tag, cmd_op, 532 + is_target_io(cqe->user_data), 533 + user_data_to_tgt_data(cqe->user_data), 534 + (q->state & UBLKSRV_QUEUE_STOPPING)); 535 + 536 + /* Don't retrieve io in case of target io */ 537 + if (is_target_io(cqe->user_data)) { 538 + ublksrv_handle_tgt_cqe(q, cqe); 539 + return; 540 + } 541 + 542 + io = &q->ios[tag]; 543 + q->cmd_inflight--; 544 + 545 + if (!fetch) { 546 + q->state |= UBLKSRV_QUEUE_STOPPING; 547 + io->flags &= ~UBLKSRV_NEED_FETCH_RQ; 548 + } 549 + 550 + if (cqe->res == UBLK_IO_RES_OK) { 551 + assert(tag < q->q_depth); 552 + if (q->tgt_ops->queue_io) 553 + q->tgt_ops->queue_io(q, tag); 554 + } else { 555 + /* 556 + * COMMIT_REQ will be completed immediately since no fetching 557 + * piggyback is required. 
558 + * 559 + * Marking IO_FREE only, then this io won't be issued since 560 + * we only issue io with (UBLKSRV_IO_FREE | UBLKSRV_NEED_*) 561 + * 562 + * */ 563 + io->flags = UBLKSRV_IO_FREE; 564 + } 565 + } 566 + 567 + static int ublk_reap_events_uring(struct io_uring *r) 568 + { 569 + struct io_uring_cqe *cqe; 570 + unsigned head; 571 + int count = 0; 572 + 573 + io_uring_for_each_cqe(r, head, cqe) { 574 + ublk_handle_cqe(r, cqe, NULL); 575 + count += 1; 576 + } 577 + io_uring_cq_advance(r, count); 578 + 579 + return count; 580 + } 581 + 582 + static int ublk_process_io(struct ublk_queue *q) 583 + { 584 + int ret, reapped; 585 + 586 + ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight cmd %u stopping %d\n", 587 + q->dev->dev_info.dev_id, 588 + q->q_id, io_uring_sq_ready(&q->ring), 589 + q->cmd_inflight, 590 + (q->state & UBLKSRV_QUEUE_STOPPING)); 591 + 592 + if (ublk_queue_is_done(q)) 593 + return -ENODEV; 594 + 595 + ret = io_uring_submit_and_wait(&q->ring, 1); 596 + reapped = ublk_reap_events_uring(&q->ring); 597 + 598 + ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n", 599 + ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING), 600 + (q->state & UBLKSRV_QUEUE_IDLE)); 601 + 602 + return reapped; 603 + } 604 + 605 + static void *ublk_io_handler_fn(void *data) 606 + { 607 + struct ublk_queue *q = data; 608 + int dev_id = q->dev->dev_info.dev_id; 609 + int ret; 610 + 611 + ret = ublk_queue_init(q); 612 + if (ret) { 613 + ublk_err("ublk dev %d queue %d init queue failed\n", 614 + dev_id, q->q_id); 615 + return NULL; 616 + } 617 + ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n", 618 + q->tid, dev_id, q->q_id); 619 + 620 + /* submit all io commands to ublk driver */ 621 + ublk_submit_fetch_commands(q); 622 + do { 623 + if (ublk_process_io(q) < 0) 624 + break; 625 + } while (1); 626 + 627 + ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id); 628 + ublk_queue_deinit(q); 629 + return NULL; 630 + } 
631 + 632 + static void ublk_set_parameters(struct ublk_dev *dev) 633 + { 634 + int ret; 635 + 636 + ret = ublk_ctrl_set_params(dev, &dev->tgt.params); 637 + if (ret) 638 + ublk_err("dev %d set basic parameter failed %d\n", 639 + dev->dev_info.dev_id, ret); 640 + } 641 + 642 + static int ublk_send_dev_event(const struct dev_ctx *ctx, int dev_id) 643 + { 644 + uint64_t id; 645 + int evtfd = ctx->_evtfd; 646 + 647 + if (evtfd < 0) 648 + return -EBADF; 649 + 650 + if (dev_id >= 0) 651 + id = dev_id + 1; 652 + else 653 + id = ERROR_EVTFD_DEVID; 654 + 655 + if (write(evtfd, &id, sizeof(id)) != sizeof(id)) 656 + return -EINVAL; 657 + 658 + return 0; 659 + } 660 + 661 + 662 + static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) 663 + { 664 + int ret, i; 665 + void *thread_ret; 666 + const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; 667 + 668 + ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__); 669 + 670 + ret = ublk_dev_prep(ctx, dev); 671 + if (ret) 672 + return ret; 673 + 674 + for (i = 0; i < dinfo->nr_hw_queues; i++) { 675 + dev->q[i].dev = dev; 676 + dev->q[i].q_id = i; 677 + pthread_create(&dev->q[i].thread, NULL, 678 + ublk_io_handler_fn, 679 + &dev->q[i]); 680 + } 681 + 682 + /* everything is fine now, start us */ 683 + ublk_set_parameters(dev); 684 + ret = ublk_ctrl_start_dev(dev, getpid()); 685 + if (ret < 0) { 686 + ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret); 687 + goto fail; 688 + } 689 + 690 + ublk_ctrl_get_info(dev); 691 + if (ctx->fg) 692 + ublk_ctrl_dump(dev); 693 + else 694 + ublk_send_dev_event(ctx, dev->dev_info.dev_id); 695 + 696 + /* wait until we are terminated */ 697 + for (i = 0; i < dinfo->nr_hw_queues; i++) 698 + pthread_join(dev->q[i].thread, &thread_ret); 699 + fail: 700 + ublk_dev_unprep(dev); 701 + ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__); 702 + 703 + return ret; 704 + } 705 + 706 + static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout) 707 + { 708 + #define EV_SIZE 
(sizeof(struct inotify_event)) 709 + #define EV_BUF_LEN (128 * (EV_SIZE + 16)) 710 + struct pollfd pfd; 711 + int fd, wd; 712 + int ret = -EINVAL; 713 + const char *dev_name = basename(path); 714 + 715 + fd = inotify_init(); 716 + if (fd < 0) { 717 + ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__); 718 + return fd; 719 + } 720 + 721 + wd = inotify_add_watch(fd, "/dev", evt_mask); 722 + if (wd == -1) { 723 + ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__); 724 + goto fail; 725 + } 726 + 727 + pfd.fd = fd; 728 + pfd.events = POLL_IN; 729 + while (1) { 730 + int i = 0; 731 + char buffer[EV_BUF_LEN]; 732 + ret = poll(&pfd, 1, 1000 * timeout); 733 + 734 + if (ret == -1) { 735 + ublk_err("%s: poll inotify failed: %d\n", __func__, ret); 736 + goto rm_watch; 737 + } else if (ret == 0) { 738 + ublk_err("%s: poll inotify timeout\n", __func__); 739 + ret = -ETIMEDOUT; 740 + goto rm_watch; 741 + } 742 + 743 + ret = read(fd, buffer, EV_BUF_LEN); 744 + if (ret < 0) { 745 + ublk_err("%s: read inotify fd failed\n", __func__); 746 + goto rm_watch; 747 + } 748 + 749 + while (i < ret) { 750 + struct inotify_event *event = (struct inotify_event *)&buffer[i]; 751 + 752 + ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n", 753 + __func__, event->mask, event->name); 754 + if (event->mask & evt_mask) { 755 + if (!strcmp(event->name, dev_name)) { 756 + ret = 0; 757 + goto rm_watch; 758 + } 759 + } 760 + i += EV_SIZE + event->len; 761 + } 762 + } 763 + rm_watch: 764 + inotify_rm_watch(fd, wd); 765 + fail: 766 + close(fd); 767 + return ret; 768 + } 769 + 770 + static int ublk_stop_io_daemon(const struct ublk_dev *dev) 771 + { 772 + int daemon_pid = dev->dev_info.ublksrv_pid; 773 + int dev_id = dev->dev_info.dev_id; 774 + char ublkc[64]; 775 + int ret = 0; 776 + 777 + if (daemon_pid < 0) 778 + return 0; 779 + 780 + /* daemon may be dead already */ 781 + if (kill(daemon_pid, 0) < 0) 782 + goto wait; 783 + 784 + snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", 
"ublkc", dev_id); 785 + 786 + /* ublk char device may be gone already */ 787 + if (access(ublkc, F_OK) != 0) 788 + goto wait; 789 + 790 + /* Wait until ublk char device is closed, when the daemon is shutdown */ 791 + ret = wait_ublk_dev(ublkc, IN_CLOSE, 10); 792 + /* double check and since it may be closed before starting inotify */ 793 + if (ret == -ETIMEDOUT) 794 + ret = kill(daemon_pid, 0) < 0; 795 + wait: 796 + waitpid(daemon_pid, NULL, 0); 797 + ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n", 798 + __func__, daemon_pid, dev_id, ret); 799 + 800 + return ret; 801 + } 802 + 803 + static int __cmd_dev_add(const struct dev_ctx *ctx) 804 + { 805 + unsigned nr_queues = ctx->nr_hw_queues; 806 + const char *tgt_type = ctx->tgt_type; 807 + unsigned depth = ctx->queue_depth; 808 + __u64 features; 809 + const struct ublk_tgt_ops *ops; 810 + struct ublksrv_ctrl_dev_info *info; 811 + struct ublk_dev *dev; 812 + int dev_id = ctx->dev_id; 813 + int ret, i; 814 + 815 + ops = ublk_find_tgt(tgt_type); 816 + if (!ops) { 817 + ublk_err("%s: no such tgt type, type %s\n", 818 + __func__, tgt_type); 819 + return -ENODEV; 820 + } 821 + 822 + if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) { 823 + ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n", 824 + __func__, nr_queues, depth); 825 + return -EINVAL; 826 + } 827 + 828 + dev = ublk_ctrl_init(); 829 + if (!dev) { 830 + ublk_err("%s: can't alloc dev id %d, type %s\n", 831 + __func__, dev_id, tgt_type); 832 + return -ENOMEM; 833 + } 834 + 835 + /* kernel doesn't support get_features */ 836 + ret = ublk_ctrl_get_features(dev, &features); 837 + if (ret < 0) 838 + return -EINVAL; 839 + 840 + if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) 841 + return -ENOTSUP; 842 + 843 + info = &dev->dev_info; 844 + info->dev_id = ctx->dev_id; 845 + info->nr_hw_queues = nr_queues; 846 + info->queue_depth = depth; 847 + info->flags = ctx->flags; 848 + dev->tgt.ops = ops; 849 + dev->tgt.sq_depth = depth; 850 + dev->tgt.cq_depth 
= depth; 851 + 852 + for (i = 0; i < MAX_BACK_FILES; i++) { 853 + if (ctx->files[i]) { 854 + strcpy(dev->tgt.backing_file[i], ctx->files[i]); 855 + dev->tgt.nr_backing_files++; 856 + } 857 + } 858 + 859 + ret = ublk_ctrl_add_dev(dev); 860 + if (ret < 0) { 861 + ublk_err("%s: can't add dev id %d, type %s ret %d\n", 862 + __func__, dev_id, tgt_type, ret); 863 + goto fail; 864 + } 865 + 866 + ret = ublk_start_daemon(ctx, dev); 867 + ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\b", ret); 868 + if (ret < 0) 869 + ublk_ctrl_del_dev(dev); 870 + 871 + fail: 872 + if (ret < 0) 873 + ublk_send_dev_event(ctx, -1); 874 + ublk_ctrl_deinit(dev); 875 + return ret; 876 + } 877 + 878 + static int __cmd_dev_list(struct dev_ctx *ctx); 879 + 880 + static int cmd_dev_add(struct dev_ctx *ctx) 881 + { 882 + int res; 883 + 884 + if (ctx->fg) 885 + goto run; 886 + 887 + ctx->_evtfd = eventfd(0, 0); 888 + if (ctx->_evtfd < 0) { 889 + ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno)); 890 + exit(-1); 891 + } 892 + 893 + setsid(); 894 + res = fork(); 895 + if (res == 0) { 896 + run: 897 + res = __cmd_dev_add(ctx); 898 + return res; 899 + } else if (res > 0) { 900 + uint64_t id; 901 + 902 + res = read(ctx->_evtfd, &id, sizeof(id)); 903 + close(ctx->_evtfd); 904 + if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) { 905 + ctx->dev_id = id - 1; 906 + return __cmd_dev_list(ctx); 907 + } 908 + exit(EXIT_FAILURE); 909 + } else { 910 + return res; 911 + } 912 + } 913 + 914 + static int __cmd_dev_del(struct dev_ctx *ctx) 915 + { 916 + int number = ctx->dev_id; 917 + struct ublk_dev *dev; 918 + int ret; 919 + 920 + dev = ublk_ctrl_init(); 921 + dev->dev_info.dev_id = number; 922 + 923 + ret = ublk_ctrl_get_info(dev); 924 + if (ret < 0) 925 + goto fail; 926 + 927 + ret = ublk_ctrl_stop_dev(dev); 928 + if (ret < 0) 929 + ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret); 930 + 931 + ret = ublk_stop_io_daemon(dev); 932 + if (ret < 0) 933 + ublk_err("%s: stop daemon id 
%d dev %d, ret %d\n", 934 + __func__, dev->dev_info.ublksrv_pid, number, ret); 935 + ublk_ctrl_del_dev(dev); 936 + fail: 937 + ublk_ctrl_deinit(dev); 938 + 939 + return (ret >= 0) ? 0 : ret; 940 + } 941 + 942 + static int cmd_dev_del(struct dev_ctx *ctx) 943 + { 944 + int i; 945 + 946 + if (ctx->dev_id >= 0 || !ctx->all) 947 + return __cmd_dev_del(ctx); 948 + 949 + for (i = 0; i < 255; i++) { 950 + ctx->dev_id = i; 951 + __cmd_dev_del(ctx); 952 + } 953 + return 0; 954 + } 955 + 956 + static int __cmd_dev_list(struct dev_ctx *ctx) 957 + { 958 + struct ublk_dev *dev = ublk_ctrl_init(); 959 + int ret; 960 + 961 + if (!dev) 962 + return -ENODEV; 963 + 964 + dev->dev_info.dev_id = ctx->dev_id; 965 + 966 + ret = ublk_ctrl_get_info(dev); 967 + if (ret < 0) { 968 + if (ctx->logging) 969 + ublk_err("%s: can't get dev info from %d: %d\n", 970 + __func__, ctx->dev_id, ret); 971 + } else { 972 + ublk_ctrl_dump(dev); 973 + } 974 + 975 + ublk_ctrl_deinit(dev); 976 + 977 + return ret; 978 + } 979 + 980 + static int cmd_dev_list(struct dev_ctx *ctx) 981 + { 982 + int i; 983 + 984 + if (ctx->dev_id >= 0 || !ctx->all) 985 + return __cmd_dev_list(ctx); 986 + 987 + ctx->logging = false; 988 + for (i = 0; i < 255; i++) { 989 + ctx->dev_id = i; 990 + __cmd_dev_list(ctx); 991 + } 992 + return 0; 993 + } 994 + 995 + static int cmd_dev_get_features(void) 996 + { 997 + #define const_ilog2(x) (63 - __builtin_clzll(x)) 998 + static const char *feat_map[] = { 999 + [const_ilog2(UBLK_F_SUPPORT_ZERO_COPY)] = "ZERO_COPY", 1000 + [const_ilog2(UBLK_F_URING_CMD_COMP_IN_TASK)] = "COMP_IN_TASK", 1001 + [const_ilog2(UBLK_F_NEED_GET_DATA)] = "GET_DATA", 1002 + [const_ilog2(UBLK_F_USER_RECOVERY)] = "USER_RECOVERY", 1003 + [const_ilog2(UBLK_F_USER_RECOVERY_REISSUE)] = "RECOVERY_REISSUE", 1004 + [const_ilog2(UBLK_F_UNPRIVILEGED_DEV)] = "UNPRIVILEGED_DEV", 1005 + [const_ilog2(UBLK_F_CMD_IOCTL_ENCODE)] = "CMD_IOCTL_ENCODE", 1006 + [const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY", 1007 + 
[const_ilog2(UBLK_F_ZONED)] = "ZONED", 1008 + [const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO", 1009 + }; 1010 + struct ublk_dev *dev; 1011 + __u64 features = 0; 1012 + int ret; 1013 + 1014 + dev = ublk_ctrl_init(); 1015 + if (!dev) { 1016 + fprintf(stderr, "ublksrv_ctrl_init failed id\n"); 1017 + return -EOPNOTSUPP; 1018 + } 1019 + 1020 + ret = ublk_ctrl_get_features(dev, &features); 1021 + if (!ret) { 1022 + int i; 1023 + 1024 + printf("ublk_drv features: 0x%llx\n", features); 1025 + 1026 + for (i = 0; i < sizeof(features) * 8; i++) { 1027 + const char *feat; 1028 + 1029 + if (!((1ULL << i) & features)) 1030 + continue; 1031 + if (i < sizeof(feat_map) / sizeof(feat_map[0])) 1032 + feat = feat_map[i]; 1033 + else 1034 + feat = "unknown"; 1035 + printf("\t%-20s: 0x%llx\n", feat, 1ULL << i); 1036 + } 1037 + } 1038 + 1039 + return ret; 1040 + } 1041 + 1042 + static int cmd_dev_help(char *exe) 1043 + { 1044 + printf("%s add -t [null|loop] [-q nr_queues] [-d depth] [-n dev_id] [backfile1] [backfile2] ...\n", exe); 1045 + printf("\t default: nr_queues=2(max 4), depth=128(max 128), dev_id=-1(auto allocation)\n"); 1046 + printf("%s del [-n dev_id] -a \n", exe); 1047 + printf("\t -a delete all devices -n delete specified device\n"); 1048 + printf("%s list [-n dev_id] -a \n", exe); 1049 + printf("\t -a list all devices, -n list specified device, default -a \n"); 1050 + printf("%s features\n", exe); 1051 + return 0; 1052 + } 1053 + 1054 + int main(int argc, char *argv[]) 1055 + { 1056 + static const struct option longopts[] = { 1057 + { "all", 0, NULL, 'a' }, 1058 + { "type", 1, NULL, 't' }, 1059 + { "number", 1, NULL, 'n' }, 1060 + { "queues", 1, NULL, 'q' }, 1061 + { "depth", 1, NULL, 'd' }, 1062 + { "debug_mask", 1, NULL, 0 }, 1063 + { "quiet", 0, NULL, 0 }, 1064 + { "zero_copy", 0, NULL, 'z' }, 1065 + { "foreground", 0, NULL, 0 }, 1066 + { "chunk_size", 1, NULL, 0 }, 1067 + { 0, 0, 0, 0 } 1068 + }; 1069 + int option_idx, opt; 1070 + const char *cmd = 
argv[1]; 1071 + struct dev_ctx ctx = { 1072 + .queue_depth = 128, 1073 + .nr_hw_queues = 2, 1074 + .dev_id = -1, 1075 + .tgt_type = "unknown", 1076 + .chunk_size = 65536, /* def chunk size is 64K */ 1077 + }; 1078 + int ret = -EINVAL, i; 1079 + 1080 + if (argc == 1) 1081 + return ret; 1082 + 1083 + optind = 2; 1084 + while ((opt = getopt_long(argc, argv, "t:n:d:q:az", 1085 + longopts, &option_idx)) != -1) { 1086 + switch (opt) { 1087 + case 'a': 1088 + ctx.all = 1; 1089 + break; 1090 + case 'n': 1091 + ctx.dev_id = strtol(optarg, NULL, 10); 1092 + break; 1093 + case 't': 1094 + if (strlen(optarg) < sizeof(ctx.tgt_type)) 1095 + strcpy(ctx.tgt_type, optarg); 1096 + break; 1097 + case 'q': 1098 + ctx.nr_hw_queues = strtol(optarg, NULL, 10); 1099 + break; 1100 + case 'd': 1101 + ctx.queue_depth = strtol(optarg, NULL, 10); 1102 + break; 1103 + case 'z': 1104 + ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY; 1105 + break; 1106 + case 0: 1107 + if (!strcmp(longopts[option_idx].name, "debug_mask")) 1108 + ublk_dbg_mask = strtol(optarg, NULL, 16); 1109 + if (!strcmp(longopts[option_idx].name, "quiet")) 1110 + ublk_dbg_mask = 0; 1111 + if (!strcmp(longopts[option_idx].name, "foreground")) 1112 + ctx.fg = 1; 1113 + if (!strcmp(longopts[option_idx].name, "chunk_size")) 1114 + ctx.chunk_size = strtol(optarg, NULL, 10); 1115 + } 1116 + } 1117 + 1118 + i = optind; 1119 + while (i < argc && ctx.nr_files < MAX_BACK_FILES) { 1120 + ctx.files[ctx.nr_files++] = argv[i++]; 1121 + } 1122 + 1123 + if (!strcmp(cmd, "add")) 1124 + ret = cmd_dev_add(&ctx); 1125 + else if (!strcmp(cmd, "del")) 1126 + ret = cmd_dev_del(&ctx); 1127 + else if (!strcmp(cmd, "list")) { 1128 + ctx.all = 1; 1129 + ret = cmd_dev_list(&ctx); 1130 + } else if (!strcmp(cmd, "help")) 1131 + ret = cmd_dev_help(argv[0]); 1132 + else if (!strcmp(cmd, "features")) 1133 + ret = cmd_dev_get_features(); 1134 + else 1135 + cmd_dev_help(argv[0]); 1136 + 1137 + return ret; 1138 + }
+370
tools/testing/selftests/ublk/kublk.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef KUBLK_INTERNAL_H 3 + #define KUBLK_INTERNAL_H 4 + 5 + #include <unistd.h> 6 + #include <stdlib.h> 7 + #include <assert.h> 8 + #include <stdio.h> 9 + #include <stdarg.h> 10 + #include <string.h> 11 + #include <pthread.h> 12 + #include <getopt.h> 13 + #include <limits.h> 14 + #include <poll.h> 15 + #include <fcntl.h> 16 + #include <sys/syscall.h> 17 + #include <sys/mman.h> 18 + #include <sys/ioctl.h> 19 + #include <sys/inotify.h> 20 + #include <sys/wait.h> 21 + #include <sys/eventfd.h> 22 + #include <sys/uio.h> 23 + #include <liburing.h> 24 + #include <linux/ublk_cmd.h> 25 + #include "ublk_dep.h" 26 + 27 + #define __maybe_unused __attribute__((unused)) 28 + #define MAX_BACK_FILES 4 29 + #ifndef min 30 + #define min(a, b) ((a) < (b) ? (a) : (b)) 31 + #endif 32 + 33 + /****************** part 1: libublk ********************/ 34 + 35 + #define CTRL_DEV "/dev/ublk-control" 36 + #define UBLKC_DEV "/dev/ublkc" 37 + #define UBLKB_DEV "/dev/ublkb" 38 + #define UBLK_CTRL_RING_DEPTH 32 39 + #define ERROR_EVTFD_DEVID -2 40 + 41 + /* queue idle timeout */ 42 + #define UBLKSRV_IO_IDLE_SECS 20 43 + 44 + #define UBLK_IO_MAX_BYTES (1 << 20) 45 + #define UBLK_MAX_QUEUES 4 46 + #define UBLK_QUEUE_DEPTH 128 47 + 48 + #define UBLK_DBG_DEV (1U << 0) 49 + #define UBLK_DBG_QUEUE (1U << 1) 50 + #define UBLK_DBG_IO_CMD (1U << 2) 51 + #define UBLK_DBG_IO (1U << 3) 52 + #define UBLK_DBG_CTRL_CMD (1U << 4) 53 + #define UBLK_LOG (1U << 5) 54 + 55 + struct ublk_dev; 56 + struct ublk_queue; 57 + 58 + struct dev_ctx { 59 + char tgt_type[16]; 60 + unsigned long flags; 61 + unsigned nr_hw_queues; 62 + unsigned queue_depth; 63 + int dev_id; 64 + int nr_files; 65 + char *files[MAX_BACK_FILES]; 66 + unsigned int logging:1; 67 + unsigned int all:1; 68 + unsigned int fg:1; 69 + 70 + /* stripe */ 71 + unsigned int chunk_size; 72 + 73 + int _evtfd; 74 + }; 75 + 76 + struct ublk_ctrl_cmd_data { 77 + __u32 cmd_op; 78 + #define CTRL_CMD_HAS_DATA 1 79 + 
#define CTRL_CMD_HAS_BUF 2 80 + __u32 flags; 81 + 82 + __u64 data[2]; 83 + __u64 addr; 84 + __u32 len; 85 + }; 86 + 87 + struct ublk_io { 88 + char *buf_addr; 89 + 90 + #define UBLKSRV_NEED_FETCH_RQ (1UL << 0) 91 + #define UBLKSRV_NEED_COMMIT_RQ_COMP (1UL << 1) 92 + #define UBLKSRV_IO_FREE (1UL << 2) 93 + unsigned short flags; 94 + unsigned short refs; /* used by target code only */ 95 + 96 + int result; 97 + 98 + unsigned short tgt_ios; 99 + void *private_data; 100 + }; 101 + 102 + struct ublk_tgt_ops { 103 + const char *name; 104 + int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *); 105 + void (*deinit_tgt)(struct ublk_dev *); 106 + 107 + int (*queue_io)(struct ublk_queue *, int tag); 108 + void (*tgt_io_done)(struct ublk_queue *, 109 + int tag, const struct io_uring_cqe *); 110 + }; 111 + 112 + struct ublk_tgt { 113 + unsigned long dev_size; 114 + unsigned int sq_depth; 115 + unsigned int cq_depth; 116 + const struct ublk_tgt_ops *ops; 117 + struct ublk_params params; 118 + 119 + int nr_backing_files; 120 + unsigned long backing_file_size[MAX_BACK_FILES]; 121 + char backing_file[MAX_BACK_FILES][PATH_MAX]; 122 + }; 123 + 124 + struct ublk_queue { 125 + int q_id; 126 + int q_depth; 127 + unsigned int cmd_inflight; 128 + unsigned int io_inflight; 129 + struct ublk_dev *dev; 130 + const struct ublk_tgt_ops *tgt_ops; 131 + char *io_cmd_buf; 132 + struct io_uring ring; 133 + struct ublk_io ios[UBLK_QUEUE_DEPTH]; 134 + #define UBLKSRV_QUEUE_STOPPING (1U << 0) 135 + #define UBLKSRV_QUEUE_IDLE (1U << 1) 136 + #define UBLKSRV_NO_BUF (1U << 2) 137 + #define UBLKSRV_ZC (1U << 3) 138 + unsigned state; 139 + pid_t tid; 140 + pthread_t thread; 141 + }; 142 + 143 + struct ublk_dev { 144 + struct ublk_tgt tgt; 145 + struct ublksrv_ctrl_dev_info dev_info; 146 + struct ublk_queue q[UBLK_MAX_QUEUES]; 147 + 148 + int fds[MAX_BACK_FILES + 1]; /* fds[0] points to /dev/ublkcN */ 149 + int nr_fds; 150 + int ctrl_fd; 151 + struct io_uring ring; 152 + 153 + void *private_data; 
154 + }; 155 + 156 + #ifndef offsetof 157 + #define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) 158 + #endif 159 + 160 + #ifndef container_of 161 + #define container_of(ptr, type, member) ({ \ 162 + unsigned long __mptr = (unsigned long)(ptr); \ 163 + ((type *)(__mptr - offsetof(type, member))); }) 164 + #endif 165 + 166 + #define round_up(val, rnd) \ 167 + (((val) + ((rnd) - 1)) & ~((rnd) - 1)) 168 + 169 + 170 + extern unsigned int ublk_dbg_mask; 171 + extern int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag); 172 + 173 + static inline int is_target_io(__u64 user_data) 174 + { 175 + return (user_data & (1ULL << 63)) != 0; 176 + } 177 + 178 + static inline __u64 build_user_data(unsigned tag, unsigned op, 179 + unsigned tgt_data, unsigned is_target_io) 180 + { 181 + assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16)); 182 + 183 + return tag | (op << 16) | (tgt_data << 24) | (__u64)is_target_io << 63; 184 + } 185 + 186 + static inline unsigned int user_data_to_tag(__u64 user_data) 187 + { 188 + return user_data & 0xffff; 189 + } 190 + 191 + static inline unsigned int user_data_to_op(__u64 user_data) 192 + { 193 + return (user_data >> 16) & 0xff; 194 + } 195 + 196 + static inline unsigned int user_data_to_tgt_data(__u64 user_data) 197 + { 198 + return (user_data >> 24) & 0xffff; 199 + } 200 + 201 + static inline unsigned short ublk_cmd_op_nr(unsigned int op) 202 + { 203 + return _IOC_NR(op); 204 + } 205 + 206 + static inline void ublk_err(const char *fmt, ...) 207 + { 208 + va_list ap; 209 + 210 + va_start(ap, fmt); 211 + vfprintf(stderr, fmt, ap); 212 + } 213 + 214 + static inline void ublk_log(const char *fmt, ...) 215 + { 216 + if (ublk_dbg_mask & UBLK_LOG) { 217 + va_list ap; 218 + 219 + va_start(ap, fmt); 220 + vfprintf(stdout, fmt, ap); 221 + } 222 + } 223 + 224 + static inline void ublk_dbg(int level, const char *fmt, ...) 
225 + { 226 + if (level & ublk_dbg_mask) { 227 + va_list ap; 228 + 229 + va_start(ap, fmt); 230 + vfprintf(stdout, fmt, ap); 231 + } 232 + } 233 + 234 + static inline int ublk_queue_alloc_sqes(struct ublk_queue *q, 235 + struct io_uring_sqe *sqes[], int nr_sqes) 236 + { 237 + unsigned left = io_uring_sq_space_left(&q->ring); 238 + int i; 239 + 240 + if (left < nr_sqes) 241 + io_uring_submit(&q->ring); 242 + 243 + for (i = 0; i < nr_sqes; i++) { 244 + sqes[i] = io_uring_get_sqe(&q->ring); 245 + if (!sqes[i]) 246 + return i; 247 + } 248 + 249 + return nr_sqes; 250 + } 251 + 252 + static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe, 253 + int dev_fd, int tag, int q_id, __u64 index) 254 + { 255 + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; 256 + 257 + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); 258 + sqe->opcode = IORING_OP_URING_CMD; 259 + sqe->flags |= IOSQE_FIXED_FILE; 260 + sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF; 261 + 262 + cmd->tag = tag; 263 + cmd->addr = index; 264 + cmd->q_id = q_id; 265 + } 266 + 267 + static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe, 268 + int dev_fd, int tag, int q_id, __u64 index) 269 + { 270 + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; 271 + 272 + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); 273 + sqe->opcode = IORING_OP_URING_CMD; 274 + sqe->flags |= IOSQE_FIXED_FILE; 275 + sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF; 276 + 277 + cmd->tag = tag; 278 + cmd->addr = index; 279 + cmd->q_id = q_id; 280 + } 281 + 282 + static inline void *ublk_get_sqe_cmd(const struct io_uring_sqe *sqe) 283 + { 284 + return (void *)&sqe->cmd; 285 + } 286 + 287 + static inline void ublk_set_io_res(struct ublk_queue *q, int tag, int res) 288 + { 289 + q->ios[tag].result = res; 290 + } 291 + 292 + static inline int ublk_get_io_res(const struct ublk_queue *q, unsigned tag) 293 + { 294 + return q->ios[tag].result; 295 + } 296 + 297 + static inline void ublk_mark_io_done(struct ublk_io 
*io, int res) 298 + { 299 + io->flags |= (UBLKSRV_NEED_COMMIT_RQ_COMP | UBLKSRV_IO_FREE); 300 + io->result = res; 301 + } 302 + 303 + static inline const struct ublksrv_io_desc *ublk_get_iod(const struct ublk_queue *q, int tag) 304 + { 305 + return (struct ublksrv_io_desc *)&(q->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); 306 + } 307 + 308 + static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) 309 + { 310 + __u32 *addr = (__u32 *)&sqe->off; 311 + 312 + addr[0] = cmd_op; 313 + addr[1] = 0; 314 + } 315 + 316 + static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) 317 + { 318 + return &q->ios[tag]; 319 + } 320 + 321 + static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res) 322 + { 323 + struct ublk_io *io = &q->ios[tag]; 324 + 325 + ublk_mark_io_done(io, res); 326 + 327 + return ublk_queue_io_cmd(q, io, tag); 328 + } 329 + 330 + static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int queued) 331 + { 332 + if (queued < 0) 333 + ublk_complete_io(q, tag, queued); 334 + else { 335 + struct ublk_io *io = ublk_get_io(q, tag); 336 + 337 + q->io_inflight += queued; 338 + io->tgt_ios = queued; 339 + io->result = 0; 340 + } 341 + } 342 + 343 + static inline int ublk_completed_tgt_io(struct ublk_queue *q, unsigned tag) 344 + { 345 + struct ublk_io *io = ublk_get_io(q, tag); 346 + 347 + q->io_inflight--; 348 + 349 + return --io->tgt_ios == 0; 350 + } 351 + 352 + static inline int ublk_queue_use_zc(const struct ublk_queue *q) 353 + { 354 + return q->state & UBLKSRV_ZC; 355 + } 356 + 357 + extern const struct ublk_tgt_ops null_tgt_ops; 358 + extern const struct ublk_tgt_ops loop_tgt_ops; 359 + extern const struct ublk_tgt_ops stripe_tgt_ops; 360 + 361 + void backing_file_tgt_deinit(struct ublk_dev *dev); 362 + int backing_file_tgt_init(struct ublk_dev *dev); 363 + 364 + static inline unsigned int ilog2(unsigned int x) 365 + { 366 + if (x == 0) 367 + return 0; 368 + return 
(sizeof(x) * 8 - 1) - __builtin_clz(x); 369 + } 370 + #endif
+106
tools/testing/selftests/ublk/null.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #include "kublk.h" 4 + 5 + #ifndef IORING_NOP_INJECT_RESULT 6 + #define IORING_NOP_INJECT_RESULT (1U << 0) 7 + #endif 8 + 9 + #ifndef IORING_NOP_FIXED_BUFFER 10 + #define IORING_NOP_FIXED_BUFFER (1U << 3) 11 + #endif 12 + 13 + static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) 14 + { 15 + const struct ublksrv_ctrl_dev_info *info = &dev->dev_info; 16 + unsigned long dev_size = 250UL << 30; 17 + 18 + dev->tgt.dev_size = dev_size; 19 + dev->tgt.params = (struct ublk_params) { 20 + .types = UBLK_PARAM_TYPE_BASIC, 21 + .basic = { 22 + .logical_bs_shift = 9, 23 + .physical_bs_shift = 12, 24 + .io_opt_shift = 12, 25 + .io_min_shift = 9, 26 + .max_sectors = info->max_io_buf_bytes >> 9, 27 + .dev_sectors = dev_size >> 9, 28 + }, 29 + }; 30 + 31 + if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) 32 + dev->tgt.sq_depth = dev->tgt.cq_depth = 2 * info->queue_depth; 33 + return 0; 34 + } 35 + 36 + static int null_queue_zc_io(struct ublk_queue *q, int tag) 37 + { 38 + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); 39 + unsigned ublk_op = ublksrv_get_op(iod); 40 + struct io_uring_sqe *sqe[3]; 41 + 42 + ublk_queue_alloc_sqes(q, sqe, 3); 43 + 44 + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); 45 + sqe[0]->user_data = build_user_data(tag, 46 + ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1); 47 + sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; 48 + 49 + io_uring_prep_nop(sqe[1]); 50 + sqe[1]->buf_index = tag; 51 + sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; 52 + sqe[1]->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT; 53 + sqe[1]->len = iod->nr_sectors << 9; /* injected result */ 54 + sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1); 55 + 56 + io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag); 57 + sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1); 58 + 59 + // buf register is marked as IOSQE_CQE_SKIP_SUCCESS 60 
+ return 2; 61 + } 62 + 63 + static void ublk_null_io_done(struct ublk_queue *q, int tag, 64 + const struct io_uring_cqe *cqe) 65 + { 66 + unsigned op = user_data_to_op(cqe->user_data); 67 + struct ublk_io *io = ublk_get_io(q, tag); 68 + 69 + if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { 70 + if (!io->result) 71 + io->result = cqe->res; 72 + if (cqe->res < 0) 73 + ublk_err("%s: io failed op %x user_data %lx\n", 74 + __func__, op, cqe->user_data); 75 + } 76 + 77 + /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ 78 + if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) 79 + io->tgt_ios += 1; 80 + 81 + if (ublk_completed_tgt_io(q, tag)) 82 + ublk_complete_io(q, tag, io->result); 83 + } 84 + 85 + static int ublk_null_queue_io(struct ublk_queue *q, int tag) 86 + { 87 + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); 88 + int zc = ublk_queue_use_zc(q); 89 + int queued; 90 + 91 + if (!zc) { 92 + ublk_complete_io(q, tag, iod->nr_sectors << 9); 93 + return 0; 94 + } 95 + 96 + queued = null_queue_zc_io(q, tag); 97 + ublk_queued_tgt_io(q, tag, queued); 98 + return 0; 99 + } 100 + 101 + const struct ublk_tgt_ops null_tgt_ops = { 102 + .name = "null", 103 + .init_tgt = ublk_null_tgt_init, 104 + .queue_io = ublk_null_queue_io, 105 + .tgt_io_done = ublk_null_io_done, 106 + };
+318
tools/testing/selftests/ublk/stripe.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "kublk.h" 4 + 5 + #define NR_STRIPE MAX_BACK_FILES 6 + 7 + struct stripe_conf { 8 + unsigned nr_files; 9 + unsigned shift; 10 + }; 11 + 12 + struct stripe { 13 + loff_t start; 14 + unsigned nr_sects; 15 + int seq; 16 + 17 + struct iovec *vec; 18 + unsigned nr_vec; 19 + unsigned cap; 20 + }; 21 + 22 + struct stripe_array { 23 + struct stripe s[NR_STRIPE]; 24 + unsigned nr; 25 + struct iovec _vec[]; 26 + }; 27 + 28 + static inline const struct stripe_conf *get_chunk_shift(const struct ublk_queue *q) 29 + { 30 + return (struct stripe_conf *)q->dev->private_data; 31 + } 32 + 33 + static inline unsigned calculate_nr_vec(const struct stripe_conf *conf, 34 + const struct ublksrv_io_desc *iod) 35 + { 36 + const unsigned shift = conf->shift - 9; 37 + const unsigned unit_sects = conf->nr_files << shift; 38 + loff_t start = iod->start_sector; 39 + loff_t end = start + iod->nr_sectors; 40 + 41 + return (end / unit_sects) - (start / unit_sects) + 1; 42 + } 43 + 44 + static struct stripe_array *alloc_stripe_array(const struct stripe_conf *conf, 45 + const struct ublksrv_io_desc *iod) 46 + { 47 + unsigned nr_vecs = calculate_nr_vec(conf, iod); 48 + unsigned total = nr_vecs * conf->nr_files; 49 + struct stripe_array *s; 50 + int i; 51 + 52 + s = malloc(sizeof(*s) + total * sizeof(struct iovec)); 53 + 54 + s->nr = 0; 55 + for (i = 0; i < conf->nr_files; i++) { 56 + struct stripe *t = &s->s[i]; 57 + 58 + t->nr_vec = 0; 59 + t->vec = &s->_vec[i * nr_vecs]; 60 + t->nr_sects = 0; 61 + t->cap = nr_vecs; 62 + } 63 + 64 + return s; 65 + } 66 + 67 + static void free_stripe_array(struct stripe_array *s) 68 + { 69 + free(s); 70 + } 71 + 72 + static void calculate_stripe_array(const struct stripe_conf *conf, 73 + const struct ublksrv_io_desc *iod, struct stripe_array *s) 74 + { 75 + const unsigned shift = conf->shift - 9; 76 + const unsigned chunk_sects = 1 << shift; 77 + const unsigned unit_sects = conf->nr_files << shift; 78 + 
off64_t start = iod->start_sector; 79 + off64_t end = start + iod->nr_sectors; 80 + unsigned long done = 0; 81 + unsigned idx = 0; 82 + 83 + while (start < end) { 84 + unsigned nr_sects = chunk_sects - (start & (chunk_sects - 1)); 85 + loff_t unit_off = (start / unit_sects) * unit_sects; 86 + unsigned seq = (start - unit_off) >> shift; 87 + struct stripe *this = &s->s[idx]; 88 + loff_t stripe_off = (unit_off / conf->nr_files) + 89 + (start & (chunk_sects - 1)); 90 + 91 + if (nr_sects > end - start) 92 + nr_sects = end - start; 93 + if (this->nr_sects == 0) { 94 + this->nr_sects = nr_sects; 95 + this->start = stripe_off; 96 + this->seq = seq; 97 + s->nr += 1; 98 + } else { 99 + assert(seq == this->seq); 100 + assert(this->start + this->nr_sects == stripe_off); 101 + this->nr_sects += nr_sects; 102 + } 103 + 104 + assert(this->nr_vec < this->cap); 105 + this->vec[this->nr_vec].iov_base = (void *)(iod->addr + done); 106 + this->vec[this->nr_vec++].iov_len = nr_sects << 9; 107 + 108 + start += nr_sects; 109 + done += nr_sects << 9; 110 + idx = (idx + 1) % conf->nr_files; 111 + } 112 + } 113 + 114 + static inline enum io_uring_op stripe_to_uring_op(const struct ublksrv_io_desc *iod) 115 + { 116 + unsigned ublk_op = ublksrv_get_op(iod); 117 + 118 + if (ublk_op == UBLK_IO_OP_READ) 119 + return IORING_OP_READV; 120 + else if (ublk_op == UBLK_IO_OP_WRITE) 121 + return IORING_OP_WRITEV; 122 + assert(0); 123 + } 124 + 125 + static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) 126 + { 127 + const struct stripe_conf *conf = get_chunk_shift(q); 128 + enum io_uring_op op = stripe_to_uring_op(iod); 129 + struct io_uring_sqe *sqe[NR_STRIPE]; 130 + struct stripe_array *s = alloc_stripe_array(conf, iod); 131 + struct ublk_io *io = ublk_get_io(q, tag); 132 + int i; 133 + 134 + io->private_data = s; 135 + calculate_stripe_array(conf, iod, s); 136 + 137 + ublk_queue_alloc_sqes(q, sqe, s->nr); 138 + for (i = 0; i < s->nr; i++) { 139 + struct 
stripe *t = &s->s[i]; 140 + 141 + io_uring_prep_rw(op, sqe[i], 142 + t->seq + 1, 143 + (void *)t->vec, 144 + t->nr_vec, 145 + t->start << 9); 146 + io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); 147 + /* bit63 marks us as tgt io */ 148 + sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i, 1); 149 + } 150 + return s->nr; 151 + } 152 + 153 + static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) 154 + { 155 + const struct stripe_conf *conf = get_chunk_shift(q); 156 + struct io_uring_sqe *sqe[NR_STRIPE]; 157 + int i; 158 + 159 + ublk_queue_alloc_sqes(q, sqe, conf->nr_files); 160 + for (i = 0; i < conf->nr_files; i++) { 161 + io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC); 162 + io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); 163 + sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, 1); 164 + } 165 + return conf->nr_files; 166 + } 167 + 168 + static int stripe_queue_tgt_io(struct ublk_queue *q, int tag) 169 + { 170 + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); 171 + unsigned ublk_op = ublksrv_get_op(iod); 172 + int ret = 0; 173 + 174 + switch (ublk_op) { 175 + case UBLK_IO_OP_FLUSH: 176 + ret = handle_flush(q, iod, tag); 177 + break; 178 + case UBLK_IO_OP_WRITE_ZEROES: 179 + case UBLK_IO_OP_DISCARD: 180 + ret = -ENOTSUP; 181 + break; 182 + case UBLK_IO_OP_READ: 183 + case UBLK_IO_OP_WRITE: 184 + ret = stripe_queue_tgt_rw_io(q, iod, tag); 185 + break; 186 + default: 187 + ret = -EINVAL; 188 + break; 189 + } 190 + ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u ret %d\n", __func__, tag, 191 + iod->op_flags, iod->start_sector, iod->nr_sectors << 9, ret); 192 + return ret; 193 + } 194 + 195 + static int ublk_stripe_queue_io(struct ublk_queue *q, int tag) 196 + { 197 + int queued = stripe_queue_tgt_io(q, tag); 198 + 199 + ublk_queued_tgt_io(q, tag, queued); 200 + return 0; 201 + } 202 + 203 + static void ublk_stripe_io_done(struct ublk_queue *q, int tag, 204 + const struct 
io_uring_cqe *cqe) 205 + { 206 + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); 207 + unsigned op = user_data_to_op(cqe->user_data); 208 + struct ublk_io *io = ublk_get_io(q, tag); 209 + int res = cqe->res; 210 + 211 + if (res < 0) { 212 + if (!io->result) 213 + io->result = res; 214 + ublk_err("%s: io failure %d tag %u\n", __func__, res, tag); 215 + } 216 + 217 + /* fail short READ/WRITE simply */ 218 + if (op == UBLK_IO_OP_READ || op == UBLK_IO_OP_WRITE) { 219 + unsigned seq = user_data_to_tgt_data(cqe->user_data); 220 + struct stripe_array *s = io->private_data; 221 + 222 + if (res < s->s[seq].vec->iov_len) 223 + io->result = -EIO; 224 + } 225 + 226 + if (ublk_completed_tgt_io(q, tag)) { 227 + int res = io->result; 228 + 229 + if (!res) 230 + res = iod->nr_sectors << 9; 231 + 232 + ublk_complete_io(q, tag, res); 233 + 234 + free_stripe_array(io->private_data); 235 + io->private_data = NULL; 236 + } 237 + } 238 + 239 + static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) 240 + { 241 + struct ublk_params p = { 242 + .types = UBLK_PARAM_TYPE_BASIC, 243 + .basic = { 244 + .attrs = UBLK_ATTR_VOLATILE_CACHE, 245 + .logical_bs_shift = 9, 246 + .physical_bs_shift = 12, 247 + .io_opt_shift = 12, 248 + .io_min_shift = 9, 249 + .max_sectors = dev->dev_info.max_io_buf_bytes >> 9, 250 + }, 251 + }; 252 + unsigned chunk_size = ctx->chunk_size; 253 + struct stripe_conf *conf; 254 + unsigned chunk_shift; 255 + loff_t bytes = 0; 256 + int ret, i; 257 + 258 + if ((chunk_size & (chunk_size - 1)) || !chunk_size) { 259 + ublk_err("invalid chunk size %u\n", chunk_size); 260 + return -EINVAL; 261 + } 262 + 263 + if (chunk_size < 4096 || chunk_size > 512 * 1024) { 264 + ublk_err("invalid chunk size %u\n", chunk_size); 265 + return -EINVAL; 266 + } 267 + 268 + chunk_shift = ilog2(chunk_size); 269 + 270 + ret = backing_file_tgt_init(dev); 271 + if (ret) 272 + return ret; 273 + 274 + if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > 
NR_STRIPE) 275 + return -EINVAL; 276 + 277 + assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); 278 + 279 + for (i = 0; i < dev->tgt.nr_backing_files; i++) 280 + dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1); 281 + 282 + for (i = 0; i < dev->tgt.nr_backing_files; i++) { 283 + unsigned long size = dev->tgt.backing_file_size[i]; 284 + 285 + if (size != dev->tgt.backing_file_size[0]) 286 + return -EINVAL; 287 + bytes += size; 288 + } 289 + 290 + conf = malloc(sizeof(*conf)); 291 + conf->shift = chunk_shift; 292 + conf->nr_files = dev->tgt.nr_backing_files; 293 + 294 + dev->private_data = conf; 295 + dev->tgt.dev_size = bytes; 296 + p.basic.dev_sectors = bytes >> 9; 297 + dev->tgt.params = p; 298 + dev->tgt.sq_depth = dev->dev_info.queue_depth * conf->nr_files; 299 + dev->tgt.cq_depth = dev->dev_info.queue_depth * conf->nr_files; 300 + 301 + printf("%s: shift %u files %u\n", __func__, conf->shift, conf->nr_files); 302 + 303 + return 0; 304 + } 305 + 306 + static void ublk_stripe_tgt_deinit(struct ublk_dev *dev) 307 + { 308 + free(dev->private_data); 309 + backing_file_tgt_deinit(dev); 310 + } 311 + 312 + const struct ublk_tgt_ops stripe_tgt_ops = { 313 + .name = "stripe", 314 + .init_tgt = ublk_stripe_tgt_init, 315 + .deinit_tgt = ublk_stripe_tgt_deinit, 316 + .queue_io = ublk_stripe_queue_io, 317 + .tgt_io_done = ublk_stripe_io_done, 318 + };
+246
tools/testing/selftests/ublk/test_common.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Helpers shared by the ublk selftests. Meant to be sourced; sourcing also
# creates the per-test temp file (UBLK_TMP) and exports the settings at the
# bottom of this file.

# exit status used to report a skipped test
UBLK_SKIP_CODE=4

# Succeed if program $1 is found by `command -v`.
_have_program() {
	if command -v "$1" >/dev/null 2>&1; then
		return 0
	fi
	return 1
}

# Print the dev_t of /dev/ublkb$1, packed as (major << 20 | minor).
_get_disk_dev_t() {
	local dev_id=$1
	local dev
	local major
	local minor

	dev=/dev/ublkb"${dev_id}"
	major=$(stat -c '%Hr' "$dev")
	minor=$(stat -c '%Lr' "$dev")

	echo $(( (major & 0xfff) << 20 | (minor & 0xfffff) ))
}

# Create a sparse file of size $1 in the current directory; print its name.
_create_backfile() {
	local my_size=$1
	local my_file

	my_file=$(mktemp ublk_file_"${my_size}"_XXXXX)
	truncate -s "${my_size}" "${my_file}"
	echo "$my_file"
}

_remove_backfile() {
	local file=$1

	[ -f "$file" ] && rm -f "$file"
}

_create_tmp_dir() {
	local my_file;

	my_file=$(mktemp -d ublk_dir_XXXXX)
	echo "$my_file"
}

_remove_tmp_dir() {
	local dir=$1

	[ -d "$dir" ] && rmdir "$dir"
}

# mkfs.ext4 + mount + umount on device $1. Returns the status of the first
# step that fails (0 if all succeed); the temporary mount point is removed
# on every path.
_mkfs_mount_test()
{
	local dev=$1
	local err_code=0
	local mnt_dir;

	mnt_dir=$(_create_tmp_dir)
	mkfs.ext4 -F "$dev" > /dev/null 2>&1
	err_code=$?

	if [ $err_code -eq 0 ]; then
		mount -t ext4 "$dev" "$mnt_dir" > /dev/null 2>&1
		err_code=$?
		# only unmount what was actually mounted
		if [ $err_code -eq 0 ]; then
			umount "$dev"
			err_code=$?
		fi
	fi

	_remove_tmp_dir "$mnt_dir"
	return $err_code
}

# Exit with the skip code unless running as root.
_check_root() {
	if [ $UID != 0 ]; then
		echo please run this as root >&2
		exit "$UBLK_SKIP_CODE"
	fi
}

_remove_ublk_devices() {
	${UBLK_PROG} del -a
	modprobe -r ublk_drv > /dev/null 2>&1
}

_get_ublk_dev_state() {
	${UBLK_PROG} list -n "$1" | grep "state" | awk '{print $11}'
}

_get_ublk_daemon_pid() {
	${UBLK_PROG} list -n "$1" | grep "pid" | awk '{print $7}'
}

# _prep_test <type> <description...>: require root, load ublk_drv and,
# unless quiet, print a banner.
_prep_test() {
	_check_root
	local type=$1
	shift 1
	modprobe ublk_drv > /dev/null 2>&1
	[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*"
}

# Remove every regular file named in the arguments.
_remove_test_files()
{
	local file

	# iterate "$@" so names are not re-split on whitespace
	for file in "$@"; do
		[ -f "${file}" ] && rm -f "${file}"
	done
}

# _show_result <test id> <code>: print PASS/SKIP/FAIL (when enabled) and
# exit with <code> if it is non-zero.
_show_result()
{
	if [ "$UBLK_TEST_SHOW_RESULT" -ne 0 ]; then
		if [ "$2" -eq 0 ]; then
			echo "$1 : [PASS]"
		elif [ "$2" -eq "$UBLK_SKIP_CODE" ]; then
			echo "$1 : [SKIP]"
		else
			echo "$1 : [FAIL]"
		fi
	fi
	[ "$2" -ne 0 ] && exit "$2"
	return 0
}

# don't call from sub-shell, otherwise can't exit
# _check_add_dev <test id> <code> [files...]: on a non-zero code remove the
# listed files and report/exit through _show_result.
_check_add_dev()
{
	local tid=$1
	local code=$2
	shift 2
	if [ "${code}" -ne 0 ]; then
		_remove_test_files "$@"
		_show_result "${tid}" "${code}"
	fi
}

_cleanup_test() {
	"${UBLK_PROG}" del -a
	rm -f "$UBLK_TMP"
}

# Succeed if `kublk features` output mentions $1.
_have_feature()
{
	if $UBLK_PROG "features" | grep "$1" > /dev/null 2>&1; then
		return 0
	fi
	return 1
}

# Run `kublk add "$@"` and print the new device id on stdout.
# Returns the skip code when the control device or a requested zero-copy
# feature is missing, 255 when adding the device fails.
_add_ublk_dev() {
	local kublk_temp;
	local dev_id;

	if [ ! -c /dev/ublk-control ]; then
		return ${UBLK_SKIP_CODE}
	fi
	if echo "$@" | grep -q "\-z"; then
		if ! _have_feature "ZERO_COPY"; then
			return ${UBLK_SKIP_CODE}
		fi
	fi

	kublk_temp=$(mktemp /tmp/kublk-XXXXXX)
	if ! "${UBLK_PROG}" add "$@" > "${kublk_temp}" 2>&1; then
		# stdout is captured by callers as the device id, so the
		# diagnostic has to go to stderr to be visible
		echo "fail to add ublk dev $*" >&2
		rm -f "${kublk_temp}"
		return 255
	fi

	dev_id=$(grep "dev id" "${kublk_temp}" | awk -F '[ :]' '{print $3}')
	udevadm settle
	rm -f "${kublk_temp}"
	echo "${dev_id}"
}

# kill the ublk daemon and return ublk device state
# Retries (kill -9 + 1s sleep) up to 50 times until the state equals $2.
__ublk_kill_daemon()
{
	local dev_id=$1
	local exp_state=$2
	local daemon_pid
	local state

	daemon_pid=$(_get_ublk_daemon_pid "${dev_id}")
	state=$(_get_ublk_dev_state "${dev_id}")

	for ((j=0;j<50;j++)); do
		[ "$state" == "$exp_state" ] && break
		kill -9 "$daemon_pid" > /dev/null 2>&1
		sleep 1
		state=$(_get_ublk_dev_state "${dev_id}")
	done
	echo "$state"
}

# Delete device $1 and return the status of `kublk del`.
__remove_ublk_dev_return() {
	local dev_id=$1

	${UBLK_PROG} del -n "${dev_id}"
	local res=$?
	udevadm settle
	return ${res}
}

# __run_io_and_remove <dev id> <size> <kill_server yes|no>
# Start fio in the background, optionally kill the daemon and expect the
# device to become DEAD, then delete the device and wait for fio.
# Returns 255 if the device does not die or cannot be deleted.
__run_io_and_remove()
{
	local dev_id=$1
	local size=$2
	local kill_server=$3

	fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \
		--rw=readwrite --iodepth=64 --size="${size}" --numjobs=4 \
		--runtime=20 --time_based > /dev/null 2>&1 &
	sleep 2
	if [ "${kill_server}" = "yes" ]; then
		local state
		state=$(__ublk_kill_daemon "${dev_id}" "DEAD")
		if [ "$state" != "DEAD" ]; then
			echo "device isn't dead($state) after killing daemon"
			return 255
		fi
	fi
	if ! __remove_ublk_dev_return "${dev_id}"; then
		echo "delete dev ${dev_id} failed"
		return 255
	fi
	wait
}

# Print the absolute directory of the running script.
_ublk_test_top_dir()
{
	cd "$(dirname "$0")" && pwd
}

UBLK_TMP=$(mktemp ublk_test_XXXXX)
UBLK_PROG=$(_ublk_test_top_dir)/kublk
UBLK_TEST_QUIET=1
UBLK_TEST_SHOW_RESULT=1
export UBLK_PROG
export UBLK_TEST_QUIET
export UBLK_TEST_SHOW_RESULT
+44
tools/testing/selftests/ublk/test_generic_01.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# generic_01: sequential writes to a null ublk device must be observed in
# order by the bpftrace script. Skipped when bpftrace is missing or does
# not stay running.

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh

TID="generic_01"
ERR_CODE=0

_have_program bpftrace || exit "$UBLK_SKIP_CODE"

_prep_test "null" "sequential io order"

dev_id=$(_add_ublk_dev -t null)
add_status=$?
_check_add_dev $TID $add_status

# start the tracer in the background, logging to the test temp file
dev_t=$(_get_disk_dev_t "$dev_id")
bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 &
btrace_pid=$!
sleep 2

# tracer already gone: treat as unsupported environment
if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then
	_cleanup_test "null"
	exit "$UBLK_SKIP_CODE"
fi

# run fio over this ublk disk
fio_args=(
	--name=write_seq
	--filename=/dev/ublkb"${dev_id}"
	--ioengine=libaio --iodepth=16
	--rw=write
	--size=512M
	--direct=1
	--bs=4k
)
fio "${fio_args[@]}" > /dev/null 2>&1
ERR_CODE=$?

kill "$btrace_pid"
wait

# any out-of-order report from the tracer fails the test
if grep -q "io_out_of_order" "$UBLK_TMP"; then
	cat "$UBLK_TMP"
	ERR_CODE=255
fi

_cleanup_test "null"
_show_result $TID $ERR_CODE
+32
tools/testing/selftests/ublk/test_loop_01.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# loop_01: write a loop-backed ublk device with fio and verify the data.

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh

TID="loop_01"
ERR_CODE=0

_prep_test "loop" "write and verify test"

backfile_0=$(_create_backfile 256M)

dev_id=$(_add_ublk_dev -t loop "$backfile_0")
add_status=$?
_check_add_dev $TID $add_status "${backfile_0}"

# run fio over the ublk disk
fio_args=(
	--name=write_and_verify
	--filename=/dev/ublkb"${dev_id}"
	--ioengine=libaio --iodepth=16
	--rw=write
	--size=256M
	--direct=1
	--verify=crc32c
	--do_verify=1
	--bs=4k
)
fio "${fio_args[@]}" > /dev/null 2>&1
ERR_CODE=$?

_cleanup_test "loop"

_remove_backfile "$backfile_0"

_show_result $TID $ERR_CODE
+22
tools/testing/selftests/ublk/test_loop_02.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# loop_02: make an ext4 filesystem on a loop-backed ublk device, then
# mount and unmount it.

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh

TID="loop_02"
ERR_CODE=0

_prep_test "loop" "mkfs & mount & umount"

backfile_0=$(_create_backfile 256M)

dev_id=$(_add_ublk_dev -t loop "$backfile_0")
add_status=$?
_check_add_dev $TID $add_status "$backfile_0"

ublk_disk=/dev/ublkb"${dev_id}"
_mkfs_mount_test "$ublk_disk"
ERR_CODE=$?

_cleanup_test "loop"

_remove_backfile "$backfile_0"

_show_result $TID $ERR_CODE
+31
tools/testing/selftests/ublk/test_loop_03.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# loop_03: write and verify a loop-backed ublk device created with
# zero copy (-z).

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh

TID="loop_03"
ERR_CODE=0

_prep_test "loop" "write and verify over zero copy"

backfile_0=$(_create_backfile 256M)

dev_id=$(_add_ublk_dev -t loop -z "$backfile_0")
add_status=$?
_check_add_dev $TID $add_status "$backfile_0"

# run fio over the ublk disk
fio_args=(
	--name=write_and_verify
	--filename=/dev/ublkb"${dev_id}"
	--ioengine=libaio --iodepth=64
	--rw=write
	--size=256M
	--direct=1
	--verify=crc32c
	--do_verify=1
	--bs=4k
)
fio "${fio_args[@]}" > /dev/null 2>&1
ERR_CODE=$?

_cleanup_test "loop"

_remove_backfile "$backfile_0"

_show_result $TID $ERR_CODE
+22
tools/testing/selftests/ublk/test_loop_04.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# loop_04: mkfs, mount and unmount on a loop-backed ublk device created
# with zero copy (-z).

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh

TID="loop_04"
ERR_CODE=0

_prep_test "loop" "mkfs & mount & umount with zero copy"

backfile_0=$(_create_backfile 256M)

dev_id=$(_add_ublk_dev -t loop -z "$backfile_0")
add_status=$?
_check_add_dev $TID $add_status "$backfile_0"

ublk_disk=/dev/ublkb"${dev_id}"
_mkfs_mount_test "$ublk_disk"
ERR_CODE=$?

_cleanup_test "loop"

_remove_backfile "$backfile_0"

_show_result $TID $ERR_CODE
+20
tools/testing/selftests/ublk/test_null_01.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# null_01: mixed read/write fio run against a null ublk device.

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh

TID="null_01"
ERR_CODE=0

_prep_test "null" "basic IO test"

dev_id=$(_add_ublk_dev -t null)
add_status=$?
_check_add_dev $TID $add_status

# run fio over the ublk disk
fio_args=(
	--name=job1
	--filename=/dev/ublkb"${dev_id}"
	--ioengine=libaio
	--rw=readwrite
	--iodepth=32
	--size=256M
)
fio "${fio_args[@]}" > /dev/null 2>&1
ERR_CODE=$?

_cleanup_test "null"

_show_result $TID $ERR_CODE
+20
tools/testing/selftests/ublk/test_null_02.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# null_02: mixed read/write fio run against a null ublk device created
# with zero copy (-z).

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh

TID="null_02"
ERR_CODE=0

_prep_test "null" "basic IO test with zero copy"

dev_id=$(_add_ublk_dev -t null -z)
add_status=$?
_check_add_dev $TID $add_status

# run fio over the ublk disk
fio_args=(
	--name=job1
	--filename=/dev/ublkb"${dev_id}"
	--ioengine=libaio
	--rw=readwrite
	--iodepth=32
	--size=256M
)
fio "${fio_args[@]}" > /dev/null 2>&1
ERR_CODE=$?

_cleanup_test "null"

_show_result $TID $ERR_CODE
+47
tools/testing/selftests/ublk/test_stress_01.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# stress_01: remove a ublk device while fio is still running against it,
# for the null, loop and zero-copy loop targets.

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
TID="stress_01"
ERR_CODE=0
DEV_ID=-1

# ublk_io_and_remove <size> <kublk add args...>
# For loop targets the last argument is taken as the backing file, which is
# removed if adding the device or removing it under IO fails.
ublk_io_and_remove()
{
	local size=$1
	shift 1
	local backfile=""
	local add_status

	if echo "$@" | grep -q "loop"; then
		backfile=${*: -1}
	fi

	DEV_ID=$(_add_ublk_dev "$@")
	add_status=$?
	_check_add_dev $TID $add_status "${backfile}"

	[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs. remove device(ublk add $*)"
	if ! __run_io_and_remove "${DEV_ID}" "${size}" "no"; then
		echo "/dev/ublkc${DEV_ID} isn't removed"
		_remove_backfile "${backfile}"
		exit 255
	fi
}

_prep_test "stress" "run IO and remove device"

ublk_io_and_remove 8G -t null
ERR_CODE=$?
[ ${ERR_CODE} -eq 0 ] || _show_result $TID $ERR_CODE

BACK_FILE=$(_create_backfile 256M)
ublk_io_and_remove 256M -t loop "${BACK_FILE}"
ERR_CODE=$?
[ ${ERR_CODE} -eq 0 ] || _show_result $TID $ERR_CODE

ublk_io_and_remove 256M -t loop -z "${BACK_FILE}"
ERR_CODE=$?
_cleanup_test "stress"
_remove_backfile "${BACK_FILE}"
_show_result $TID $ERR_CODE
+47
tools/testing/selftests/ublk/test_stress_02.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# stress_02: run IO against a ublk device while the ublk server is killed.

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
TID="stress_02"
ERR_CODE=0
DEV_ID=-1

# ublk_io_and_kill_daemon <size> <ublk add arguments...>
#
# Adds a ublk device with the given arguments and passes its id, <size> and
# "yes" to __run_io_and_remove. When that helper fails, a message is printed,
# the backing file (if any) is removed and the whole script exits with 255.
ublk_io_and_kill_daemon()
{
	local size=$1
	shift 1
	local backfile=""
	# When "loop" appears in the arguments, the last argument is taken
	# as the backing file.
	if echo "$@" | grep -q "loop"; then
		backfile=${*: -1}
	fi
	# Keep these two lines adjacent: $? must be the status of the
	# command substitution.
	DEV_ID=$(_add_ublk_dev "$@")
	_check_add_dev $TID $? "${backfile}"

	[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs kill ublk server(ublk add $*)"
	if ! __run_io_and_remove "${DEV_ID}" "${size}" "yes"; then
		# No "res" variable is ever assigned in this script, so the
		# message no longer expands one.
		echo "/dev/ublkc${DEV_ID} isn't removed"
		_remove_backfile "${backfile}"
		exit 255
	fi
}

_prep_test "stress" "run IO and kill ublk server"

# Round 1: null target.
ublk_io_and_kill_daemon 8G -t null
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
	_show_result $TID $ERR_CODE
fi

# Round 2: loop target on a freshly created backing file.
BACK_FILE=$(_create_backfile 256M)
ublk_io_and_kill_daemon 256M -t loop "${BACK_FILE}"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
	_show_result $TID $ERR_CODE
fi

# Round 3: loop target added with -z, reusing the same backing file.
ublk_io_and_kill_daemon 256M -t loop -z "${BACK_FILE}"
ERR_CODE=$?
_cleanup_test "stress"
_remove_backfile "${BACK_FILE}"
_show_result $TID $ERR_CODE
+34
tools/testing/selftests/ublk/test_stripe_01.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# stripe_01: write-and-verify test on a ublk stripe device built from two
# backing files.

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh

TID="stripe_01"
ERR_CODE=0

_prep_test "stripe" "write and verify test"

backfile_0=$(_create_backfile 256M)
backfile_1=$(_create_backfile 256M)

# Keep these two lines adjacent: $? must be the status of the command
# substitution. Both backing files are handed to _check_add_dev, matching
# the stripe_02 test, so neither is left out of the failure path.
dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1")
_check_add_dev $TID $? "${backfile_0}" "${backfile_1}"

# Run fio over the ublk disk: write 512M in 4k blocks with direct IO, then
# verify the data with crc32c. fio's output is discarded; only its exit
# status is kept as the test result.
fio --name=write_and_verify \
	--filename=/dev/ublkb"${dev_id}" \
	--ioengine=libaio --iodepth=32 \
	--rw=write \
	--size=512M \
	--direct=1 \
	--verify=crc32c \
	--do_verify=1 \
	--bs=4k > /dev/null 2>&1
ERR_CODE=$?

_cleanup_test "stripe"

_remove_backfile "$backfile_0"
_remove_backfile "$backfile_1"

_show_result $TID $ERR_CODE
+24
tools/testing/selftests/ublk/test_stripe_02.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# stripe_02: mkfs, mount and umount on a ublk stripe device built from two
# backing files.

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh

TID="stripe_02"
ERR_CODE=0

_prep_test "stripe" "mkfs & mount & umount"

backfile_0=$(_create_backfile 256M)
backfile_1=$(_create_backfile 256M)

# Keep these two lines adjacent: $? must be the status of the command
# substitution.
dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1")
_check_add_dev $TID $? "$backfile_0" "$backfile_1"

# The helper's exit status is the test result.
_mkfs_mount_test "/dev/ublkb${dev_id}"
ERR_CODE=$?

_cleanup_test "stripe"

# Remove the backing files in creation order.
for stripe_backfile in "$backfile_0" "$backfile_1"; do
	_remove_backfile "$stripe_backfile"
done

_show_result $TID $ERR_CODE
+25
tools/testing/selftests/ublk/trace/seq_io.bt
/*
 * Report block requests on one device that do not complete in sequential
 * sector order.
 *
 *	$1:	dev_t
 *	$2:	RWBS
 *	$3:	strlen($2)
 */
BEGIN {
	/* Expected start sector of the next matching request. */
	@last_rw[$1, str($2)] = 0;
}
tracepoint:block:block_rq_complete
{
	$dev = $1;
	/* Only requests on device $1 whose rwbs starts with the $3 bytes of $2. */
	if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) {
		$last = @last_rw[$dev, str($2)];
		if ((uint64)args.sector != $last) {
			/*
			 * $last is the sector the previous request ended at
			 * (the expected start); args.sector is what was
			 * actually observed.
			 */
			printf("io_out_of_order: exp %llu actual %llu\n",
				$last, args.sector);
		}
		/* The next request is expected to start where this one ends. */
		@last_rw[$dev, str($2)] = (args.sector + args.nr_sector);
	}
	/* Counts every completion seen by the probe, matching or not. */
	@ios = count();
}

END {
	clear(@last_rw);
}
+18
tools/testing/selftests/ublk/ublk_dep.h
#ifndef UBLK_DEP_H
#define UBLK_DEP_H

/*
 * Fallback definitions for ublk symbols. Each group is only defined when
 * the guarding macro is not already defined by the time this header is
 * included.
 *
 * This header includes nothing itself: expanding the ioctl macros below
 * requires _IOWR and struct ublksrv_io_cmd to be provided by the includer.
 */

/* Flag bit 8. */
#if !defined(UBLK_F_ZONED)
#define UBLK_F_ZONED (1ULL << 8)
#endif /* UBLK_F_ZONED */

/* Flag bit 9. */
#if !defined(UBLK_F_USER_RECOVERY_FAIL_IO)
#define UBLK_F_USER_RECOVERY_FAIL_IO (1ULL << 9)
#endif /* UBLK_F_USER_RECOVERY_FAIL_IO */

/*
 * IO buffer register/unregister commands. Both are defined together and are
 * guarded by the register command alone.
 */
#if !defined(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_U_IO_REGISTER_IO_BUF _IOWR('u', 0x23, struct ublksrv_io_cmd)
#define UBLK_U_IO_UNREGISTER_IO_BUF _IOWR('u', 0x24, struct ublksrv_io_cmd)
#endif /* UBLK_U_IO_REGISTER_IO_BUF */

#endif /* UBLK_DEP_H */