Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-6.15/io_uring' into for-6.15/io_uring-reg-vec

* for-6.15/io_uring: (80 commits)
io_uring: introduce io_cache_free() helper
io_uring/rsrc: skip NULL file/buffer checks in io_free_rsrc_node()
io_uring/rsrc: avoid NULL node check on io_sqe_buffer_register() failure
io_uring/rsrc: call io_free_node() on io_sqe_buffer_register() failure
io_uring/rsrc: free io_rsrc_node using kfree()
io_uring/rsrc: split out io_free_node() helper
io_uring/rsrc: include io_uring_types.h in rsrc.h
ublk: don't cast registered buffer index to int
io_uring/nop: use io_find_buf_node()
io_uring/rsrc: declare io_find_buf_node() in header file
io_uring/ublk: report error when unregister operation fails
io_uring: convert cmd_to_io_kiocb() macro to function
io_uring/uring_cmd: specify io_uring_cmd_import_fixed() pointer type
io_uring/rsrc: use rq_data_dir() to compute bvec dir
selftests: ublk: add ublk zero copy test
selftests: ublk: add file backed ublk
selftests: ublk: add kernel selftests for ublk
io_uring: cache nodes and mapped buffers
ublk: zc register/unregister bvec
io_uring: add support for kernel registered bvecs
...

+2983 -842
+1
MAINTAINERS
··· 24253 24253 F: Documentation/block/ublk.rst 24254 24254 F: drivers/block/ublk_drv.c 24255 24255 F: include/uapi/linux/ublk_cmd.h 24256 + F: tools/testing/selftests/ublk/ 24256 24257 24257 24258 UBSAN 24258 24259 M: Kees Cook <kees@kernel.org>
+49 -7
drivers/block/ublk_drv.c
··· 51 51 /* private ioctl command mirror */ 52 52 #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) 53 53 54 + #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) 55 + #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) 56 + 54 57 /* All UBLK_F_* have to be included into UBLK_F_ALL */ 55 58 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ 56 59 | UBLK_F_URING_CMD_COMP_IN_TASK \ ··· 199 196 200 197 static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq); 201 198 199 + static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, 200 + struct ublk_queue *ubq, int tag, size_t offset); 202 201 static inline unsigned int ublk_req_build_flags(struct request *req); 203 202 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, 204 203 int tag); 205 204 static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) 206 205 { 207 - return ub->dev_info.flags & UBLK_F_USER_COPY; 206 + return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); 208 207 } 209 208 210 209 static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) ··· 586 581 587 582 static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) 588 583 { 589 - return ubq->flags & UBLK_F_USER_COPY; 584 + return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); 590 585 } 591 586 592 587 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) ··· 1752 1747 io_uring_cmd_mark_cancelable(cmd, issue_flags); 1753 1748 } 1754 1749 1750 + static void ublk_io_release(void *priv) 1751 + { 1752 + struct request *rq = priv; 1753 + struct ublk_queue *ubq = rq->mq_hctx->driver_data; 1754 + 1755 + ublk_put_req_ref(ubq, rq); 1756 + } 1757 + 1758 + static int ublk_register_io_buf(struct io_uring_cmd *cmd, 1759 + struct ublk_queue *ubq, unsigned int tag, 1760 + unsigned int index, unsigned int issue_flags) 1761 + { 1762 + struct ublk_device *ub = 
cmd->file->private_data; 1763 + struct request *req; 1764 + int ret; 1765 + 1766 + req = __ublk_check_and_get_req(ub, ubq, tag, 0); 1767 + if (!req) 1768 + return -EINVAL; 1769 + 1770 + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, 1771 + issue_flags); 1772 + if (ret) { 1773 + ublk_put_req_ref(ubq, req); 1774 + return ret; 1775 + } 1776 + 1777 + return 0; 1778 + } 1779 + 1780 + static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, 1781 + unsigned int index, unsigned int issue_flags) 1782 + { 1783 + return io_buffer_unregister_bvec(cmd, index, issue_flags); 1784 + } 1785 + 1755 1786 static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, 1756 1787 unsigned int issue_flags, 1757 1788 const struct ublksrv_io_cmd *ub_cmd) ··· 1839 1798 1840 1799 ret = -EINVAL; 1841 1800 switch (_IOC_NR(cmd_op)) { 1801 + case UBLK_IO_REGISTER_IO_BUF: 1802 + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); 1803 + case UBLK_IO_UNREGISTER_IO_BUF: 1804 + return ublk_unregister_io_buf(cmd, ub_cmd->addr, issue_flags); 1842 1805 case UBLK_IO_FETCH_REQ: 1843 1806 /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ 1844 1807 if (ublk_queue_ready(ubq)) { ··· 2504 2459 * buffer by pwrite() to ublk char device, which can't be 2505 2460 * used for unprivileged device 2506 2461 */ 2507 - if (info.flags & UBLK_F_USER_COPY) 2462 + if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) 2508 2463 return -EINVAL; 2509 2464 } 2510 2465 ··· 2571 2526 ret = -EINVAL; 2572 2527 goto out_free_dev_number; 2573 2528 } 2574 - 2575 - /* We are not ready to support zero copy */ 2576 - ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; 2577 2529 2578 2530 ub->dev_info.nr_hw_queues = min_t(unsigned int, 2579 2531 ub->dev_info.nr_hw_queues, nr_cpu_ids); ··· 2902 2860 { 2903 2861 const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); 2904 2862 void __user *argp = (void __user *)(unsigned long)header->addr; 2905 - u64 features = UBLK_F_ALL & 
~UBLK_F_SUPPORT_ZERO_COPY; 2863 + u64 features = UBLK_F_ALL; 2906 2864 2907 2865 if (header->len != UBLK_FEATURES_LEN || !header->addr) 2908 2866 return -EINVAL;
+7 -5
drivers/nvme/host/ioctl.c
··· 114 114 115 115 static int nvme_map_user_request(struct request *req, u64 ubuffer, 116 116 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 117 - struct io_uring_cmd *ioucmd, unsigned int flags) 117 + struct io_uring_cmd *ioucmd, unsigned int flags, 118 + unsigned int iou_issue_flags) 118 119 { 119 120 struct request_queue *q = req->q; 120 121 struct nvme_ns *ns = q->queuedata; ··· 143 142 if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) 144 143 return -EINVAL; 145 144 ret = io_uring_cmd_import_fixed(ubuffer, bufflen, 146 - rq_data_dir(req), &iter, ioucmd); 145 + rq_data_dir(req), &iter, ioucmd, 146 + iou_issue_flags); 147 147 if (ret < 0) 148 148 goto out; 149 149 ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); ··· 196 194 req->timeout = timeout; 197 195 if (ubuffer && bufflen) { 198 196 ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, 199 - meta_len, NULL, flags); 197 + meta_len, NULL, flags, 0); 200 198 if (ret) 201 199 return ret; 202 200 } ··· 512 510 return PTR_ERR(req); 513 511 req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0; 514 512 515 - if (d.addr && d.data_len) { 513 + if (d.data_len) { 516 514 ret = nvme_map_user_request(req, d.addr, 517 515 d.data_len, nvme_to_user_ptr(d.metadata), 518 - d.metadata_len, ioucmd, vec); 516 + d.metadata_len, ioucmd, vec, issue_flags); 519 517 if (ret) 520 518 return ret; 521 519 }
+14 -3
include/linux/io_uring/cmd.h
··· 4 4 5 5 #include <uapi/linux/io_uring.h> 6 6 #include <linux/io_uring_types.h> 7 + #include <linux/blk-mq.h> 7 8 8 9 /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ 9 10 #define IORING_URING_CMD_CANCELABLE (1U << 30) ··· 40 39 41 40 #if defined(CONFIG_IO_URING) 42 41 int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 43 - struct iov_iter *iter, void *ioucmd); 42 + struct iov_iter *iter, 43 + struct io_uring_cmd *ioucmd, 44 + unsigned int issue_flags); 44 45 45 46 /* 46 47 * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd ··· 69 66 void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); 70 67 71 68 #else 72 - static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 73 - struct iov_iter *iter, void *ioucmd) 69 + static inline int 70 + io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 71 + struct iov_iter *iter, struct io_uring_cmd *ioucmd, 72 + unsigned int issue_flags) 74 73 { 75 74 return -EOPNOTSUPP; 76 75 } ··· 127 122 { 128 123 return cmd_to_io_kiocb(cmd)->async_data; 129 124 } 125 + 126 + int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, 127 + void (*release)(void *), unsigned int index, 128 + unsigned int issue_flags); 129 + int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, 130 + unsigned int issue_flags); 130 131 131 132 #endif /* _LINUX_IO_URING_CMD_H */
+15 -5
include/linux/io_uring_types.h
··· 292 292 293 293 struct io_file_table file_table; 294 294 struct io_rsrc_data buf_table; 295 + struct io_alloc_cache node_cache; 296 + struct io_alloc_cache imu_cache; 295 297 296 298 struct io_submit_state submit_state; 297 299 ··· 362 360 363 361 spinlock_t completion_lock; 364 362 365 - struct list_head io_buffers_comp; 366 363 struct list_head cq_overflow_list; 367 364 368 365 struct hlist_head waitid_list; ··· 379 378 380 379 unsigned int file_alloc_start; 381 380 unsigned int file_alloc_end; 382 - 383 - struct list_head io_buffers_cache; 384 381 385 382 /* Keep this last, we don't need it for the fast path */ 386 383 struct wait_queue_head poll_wq; ··· 438 439 struct io_mapped_region param_region; 439 440 }; 440 441 442 + /* 443 + * Token indicating function is called in task work context: 444 + * ctx->uring_lock is held and any completions generated will be flushed. 445 + * ONLY core io_uring.c should instantiate this struct. 446 + */ 441 447 struct io_tw_state { 442 448 }; 449 + /* Alias to use in code that doesn't instantiate struct io_tw_state */ 450 + typedef struct io_tw_state io_tw_token_t; 443 451 444 452 enum { 445 453 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, ··· 572 566 REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), 573 567 }; 574 568 575 - typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); 569 + typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); 576 570 577 571 struct io_task_work { 578 572 struct llist_node node; ··· 607 601 io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \ 608 602 ((cmd_type *)&(req)->cmd) \ 609 603 ) 610 - #define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) 604 + 605 + static inline struct io_kiocb *cmd_to_io_kiocb(void *ptr) 606 + { 607 + return ptr; 608 + } 611 609 612 610 struct io_kiocb { 613 611 union {
+4
include/uapi/linux/ublk_cmd.h
··· 94 94 _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd) 95 95 #define UBLK_U_IO_NEED_GET_DATA \ 96 96 _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd) 97 + #define UBLK_U_IO_REGISTER_IO_BUF \ 98 + _IOWR('u', 0x23, struct ublksrv_io_cmd) 99 + #define UBLK_U_IO_UNREGISTER_IO_BUF \ 100 + _IOWR('u', 0x24, struct ublksrv_io_cmd) 97 101 98 102 /* only ABORT means that no re-fetch */ 99 103 #define UBLK_IO_RES_OK 0
+6
io_uring/alloc_cache.h
··· 68 68 return io_cache_alloc_new(cache, gfp); 69 69 } 70 70 71 + static inline void io_cache_free(struct io_alloc_cache *cache, void *obj) 72 + { 73 + if (!io_alloc_cache_put(cache, obj)) 74 + kfree(obj); 75 + } 76 + 71 77 #endif
+42
io_uring/cancel.c
··· 341 341 fput(file); 342 342 return ret; 343 343 } 344 + 345 + bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 346 + struct hlist_head *list, bool cancel_all, 347 + bool (*cancel)(struct io_kiocb *)) 348 + { 349 + struct hlist_node *tmp; 350 + struct io_kiocb *req; 351 + bool found = false; 352 + 353 + lockdep_assert_held(&ctx->uring_lock); 354 + 355 + hlist_for_each_entry_safe(req, tmp, list, hash_node) { 356 + if (!io_match_task_safe(req, tctx, cancel_all)) 357 + continue; 358 + hlist_del_init(&req->hash_node); 359 + if (cancel(req)) 360 + found = true; 361 + } 362 + 363 + return found; 364 + } 365 + 366 + int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 367 + unsigned int issue_flags, struct hlist_head *list, 368 + bool (*cancel)(struct io_kiocb *)) 369 + { 370 + struct hlist_node *tmp; 371 + struct io_kiocb *req; 372 + int nr = 0; 373 + 374 + io_ring_submit_lock(ctx, issue_flags); 375 + hlist_for_each_entry_safe(req, tmp, list, hash_node) { 376 + if (!io_cancel_req_match(req, cd)) 377 + continue; 378 + if (cancel(req)) 379 + nr++; 380 + if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) 381 + break; 382 + } 383 + io_ring_submit_unlock(ctx, issue_flags); 384 + return nr ?: -ENOENT; 385 + }
+8
io_uring/cancel.h
··· 24 24 int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); 25 25 bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); 26 26 27 + bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 28 + struct hlist_head *list, bool cancel_all, 29 + bool (*cancel)(struct io_kiocb *)); 30 + 31 + int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 32 + unsigned int issue_flags, struct hlist_head *list, 33 + bool (*cancel)(struct io_kiocb *)); 34 + 27 35 static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) 28 36 { 29 37 if (req->cancel_seq_set && sequence == req->work.cancel_seq)
+1 -1
io_uring/filetable.c
··· 68 68 if (slot_index >= ctx->file_table.data.nr) 69 69 return -EINVAL; 70 70 71 - node = io_rsrc_node_alloc(IORING_RSRC_FILE); 71 + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 72 72 if (!node) 73 73 return -ENOMEM; 74 74
+12 -50
io_uring/futex.c
··· 44 44 io_alloc_cache_free(&ctx->futex_cache, kfree); 45 45 } 46 46 47 - static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) 47 + static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) 48 48 { 49 49 req->async_data = NULL; 50 50 hlist_del_init(&req->hash_node); 51 - io_req_task_complete(req, ts); 51 + io_req_task_complete(req, tw); 52 52 } 53 53 54 - static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) 54 + static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) 55 55 { 56 - struct io_futex_data *ifd = req->async_data; 57 56 struct io_ring_ctx *ctx = req->ctx; 58 57 59 - io_tw_lock(ctx, ts); 60 - if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) 61 - kfree(ifd); 62 - __io_futex_complete(req, ts); 58 + io_tw_lock(ctx, tw); 59 + io_cache_free(&ctx->futex_cache, req->async_data); 60 + __io_futex_complete(req, tw); 63 61 } 64 62 65 - static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) 63 + static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) 66 64 { 67 65 struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 68 66 struct futex_vector *futexv = req->async_data; 69 67 70 - io_tw_lock(req->ctx, ts); 68 + io_tw_lock(req->ctx, tw); 71 69 72 70 if (!iof->futexv_unqueued) { 73 71 int res; ··· 77 79 78 80 kfree(req->async_data); 79 81 req->flags &= ~REQ_F_ASYNC_DATA; 80 - __io_futex_complete(req, ts); 82 + __io_futex_complete(req, tw); 81 83 } 82 84 83 85 static bool io_futexv_claim(struct io_futex *iof) ··· 88 90 return true; 89 91 } 90 92 91 - static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) 93 + static bool __io_futex_cancel(struct io_kiocb *req) 92 94 { 93 95 /* futex wake already done or in progress */ 94 96 if (req->opcode == IORING_OP_FUTEX_WAIT) { ··· 114 116 int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 115 117 unsigned int issue_flags) 116 118 { 117 - struct hlist_node *tmp; 118 - 
struct io_kiocb *req; 119 - int nr = 0; 120 - 121 - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) 122 - return -ENOENT; 123 - 124 - io_ring_submit_lock(ctx, issue_flags); 125 - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { 126 - if (req->cqe.user_data != cd->data && 127 - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) 128 - continue; 129 - if (__io_futex_cancel(ctx, req)) 130 - nr++; 131 - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) 132 - break; 133 - } 134 - io_ring_submit_unlock(ctx, issue_flags); 135 - 136 - if (nr) 137 - return nr; 138 - 139 - return -ENOENT; 119 + return io_cancel_remove(ctx, cd, issue_flags, &ctx->futex_list, __io_futex_cancel); 140 120 } 141 121 142 122 bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 143 123 bool cancel_all) 144 124 { 145 - struct hlist_node *tmp; 146 - struct io_kiocb *req; 147 - bool found = false; 148 - 149 - lockdep_assert_held(&ctx->uring_lock); 150 - 151 - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { 152 - if (!io_match_task_safe(req, tctx, cancel_all)) 153 - continue; 154 - hlist_del_init(&req->hash_node); 155 - __io_futex_cancel(ctx, req); 156 - found = true; 157 - } 158 - 159 - return found; 125 + return io_cancel_remove_all(ctx, tctx, &ctx->futex_list, cancel_all, __io_futex_cancel); 160 126 } 161 127 162 128 int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+131 -99
io_uring/io-wq.c
··· 30 30 IO_WORKER_F_UP = 0, /* up and active */ 31 31 IO_WORKER_F_RUNNING = 1, /* account as running */ 32 32 IO_WORKER_F_FREE = 2, /* worker on free list */ 33 - IO_WORKER_F_BOUND = 3, /* is doing bounded work */ 34 33 }; 35 34 36 35 enum { ··· 45 46 */ 46 47 struct io_worker { 47 48 refcount_t ref; 48 - int create_index; 49 49 unsigned long flags; 50 50 struct hlist_nulls_node nulls_node; 51 51 struct list_head all_list; 52 52 struct task_struct *task; 53 53 struct io_wq *wq; 54 + struct io_wq_acct *acct; 54 55 55 56 struct io_wq_work *cur_work; 56 57 raw_spinlock_t lock; ··· 76 77 #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) 77 78 78 79 struct io_wq_acct { 80 + /** 81 + * Protects access to the worker lists. 82 + */ 83 + raw_spinlock_t workers_lock; 84 + 79 85 unsigned nr_workers; 80 86 unsigned max_workers; 81 - int index; 82 87 atomic_t nr_running; 88 + 89 + /** 90 + * The list of free workers. Protected by #workers_lock 91 + * (write) and RCU (read). 92 + */ 93 + struct hlist_nulls_head free_list; 94 + 95 + /** 96 + * The list of all workers. Protected by #workers_lock 97 + * (write) and RCU (read). 
98 + */ 99 + struct list_head all_list; 100 + 83 101 raw_spinlock_t lock; 84 102 struct io_wq_work_list work_list; 85 103 unsigned long flags; ··· 128 112 129 113 struct io_wq_acct acct[IO_WQ_ACCT_NR]; 130 114 131 - /* lock protects access to elements below */ 132 - raw_spinlock_t lock; 133 - 134 - struct hlist_nulls_head free_list; 135 - struct list_head all_list; 136 - 137 115 struct wait_queue_entry wait; 138 116 139 117 struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; ··· 145 135 bool cancel_all; 146 136 }; 147 137 148 - static bool create_io_worker(struct io_wq *wq, int index); 138 + static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct); 149 139 static void io_wq_dec_running(struct io_worker *worker); 150 140 static bool io_acct_cancel_pending_work(struct io_wq *wq, 151 141 struct io_wq_acct *acct, ··· 170 160 } 171 161 172 162 static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, 173 - struct io_wq_work *work) 163 + unsigned int work_flags) 174 164 { 175 - return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); 165 + return io_get_acct(wq, !(work_flags & IO_WQ_WORK_UNBOUND)); 176 166 } 177 167 178 168 static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) 179 169 { 180 - return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags)); 170 + return worker->acct; 181 171 } 182 172 183 173 static void io_worker_ref_put(struct io_wq *wq) ··· 202 192 struct io_wq *wq = worker->wq; 203 193 204 194 atomic_dec(&acct->nr_running); 205 - raw_spin_lock(&wq->lock); 195 + raw_spin_lock(&acct->workers_lock); 206 196 acct->nr_workers--; 207 - raw_spin_unlock(&wq->lock); 197 + raw_spin_unlock(&acct->workers_lock); 208 198 io_worker_ref_put(wq); 209 199 clear_bit_unlock(0, &worker->create_state); 210 200 io_worker_release(worker); ··· 223 213 static void io_worker_exit(struct io_worker *worker) 224 214 { 225 215 struct io_wq *wq = worker->wq; 216 + struct io_wq_acct *acct = 
io_wq_get_acct(worker); 226 217 227 218 while (1) { 228 219 struct callback_head *cb = task_work_cancel_match(wq->task, ··· 237 226 io_worker_release(worker); 238 227 wait_for_completion(&worker->ref_done); 239 228 240 - raw_spin_lock(&wq->lock); 229 + raw_spin_lock(&acct->workers_lock); 241 230 if (test_bit(IO_WORKER_F_FREE, &worker->flags)) 242 231 hlist_nulls_del_rcu(&worker->nulls_node); 243 232 list_del_rcu(&worker->all_list); 244 - raw_spin_unlock(&wq->lock); 233 + raw_spin_unlock(&acct->workers_lock); 245 234 io_wq_dec_running(worker); 246 235 /* 247 236 * this worker is a goner, clear ->worker_private to avoid any ··· 280 269 * Check head of free list for an available worker. If one isn't available, 281 270 * caller must create one. 282 271 */ 283 - static bool io_wq_activate_free_worker(struct io_wq *wq, 284 - struct io_wq_acct *acct) 272 + static bool io_acct_activate_free_worker(struct io_wq_acct *acct) 285 273 __must_hold(RCU) 286 274 { 287 275 struct hlist_nulls_node *n; ··· 291 281 * activate. If a given worker is on the free_list but in the process 292 282 * of exiting, keep trying. 293 283 */ 294 - hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) { 284 + hlist_nulls_for_each_entry_rcu(worker, n, &acct->free_list, nulls_node) { 295 285 if (!io_worker_get(worker)) 296 286 continue; 297 - if (io_wq_get_acct(worker) != acct) { 298 - io_worker_release(worker); 299 - continue; 300 - } 301 287 /* 302 288 * If the worker is already running, it's either already 303 289 * starting work or finishing work. 
In either case, if it does ··· 320 314 if (unlikely(!acct->max_workers)) 321 315 pr_warn_once("io-wq is not configured for unbound workers"); 322 316 323 - raw_spin_lock(&wq->lock); 317 + raw_spin_lock(&acct->workers_lock); 324 318 if (acct->nr_workers >= acct->max_workers) { 325 - raw_spin_unlock(&wq->lock); 319 + raw_spin_unlock(&acct->workers_lock); 326 320 return true; 327 321 } 328 322 acct->nr_workers++; 329 - raw_spin_unlock(&wq->lock); 323 + raw_spin_unlock(&acct->workers_lock); 330 324 atomic_inc(&acct->nr_running); 331 325 atomic_inc(&wq->worker_refs); 332 - return create_io_worker(wq, acct->index); 326 + return create_io_worker(wq, acct); 333 327 } 334 328 335 329 static void io_wq_inc_running(struct io_worker *worker) ··· 349 343 350 344 worker = container_of(cb, struct io_worker, create_work); 351 345 wq = worker->wq; 352 - acct = &wq->acct[worker->create_index]; 353 - raw_spin_lock(&wq->lock); 346 + acct = worker->acct; 347 + raw_spin_lock(&acct->workers_lock); 354 348 355 349 if (acct->nr_workers < acct->max_workers) { 356 350 acct->nr_workers++; 357 351 do_create = true; 358 352 } 359 - raw_spin_unlock(&wq->lock); 353 + raw_spin_unlock(&acct->workers_lock); 360 354 if (do_create) { 361 - create_io_worker(wq, worker->create_index); 355 + create_io_worker(wq, acct); 362 356 } else { 363 357 atomic_dec(&acct->nr_running); 364 358 io_worker_ref_put(wq); ··· 390 384 391 385 atomic_inc(&wq->worker_refs); 392 386 init_task_work(&worker->create_work, func); 393 - worker->create_index = acct->index; 394 387 if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { 395 388 /* 396 389 * EXIT may have been set after checking it above, check after ··· 435 430 * Worker will start processing some work. 
Move it to the busy list, if 436 431 * it's currently on the freelist 437 432 */ 438 - static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) 433 + static void __io_worker_busy(struct io_wq_acct *acct, struct io_worker *worker) 439 434 { 440 435 if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { 441 436 clear_bit(IO_WORKER_F_FREE, &worker->flags); 442 - raw_spin_lock(&wq->lock); 437 + raw_spin_lock(&acct->workers_lock); 443 438 hlist_nulls_del_init_rcu(&worker->nulls_node); 444 - raw_spin_unlock(&wq->lock); 439 + raw_spin_unlock(&acct->workers_lock); 445 440 } 446 441 } 447 442 448 443 /* 449 444 * No work, worker going to sleep. Move to freelist. 450 445 */ 451 - static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) 452 - __must_hold(wq->lock) 446 + static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker) 447 + __must_hold(acct->workers_lock) 453 448 { 454 449 if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { 455 450 set_bit(IO_WORKER_F_FREE, &worker->flags); 456 - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); 451 + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); 457 452 } 453 + } 454 + 455 + static inline unsigned int __io_get_work_hash(unsigned int work_flags) 456 + { 457 + return work_flags >> IO_WQ_HASH_SHIFT; 458 458 } 459 459 460 460 static inline unsigned int io_get_work_hash(struct io_wq_work *work) 461 461 { 462 - return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; 462 + return __io_get_work_hash(atomic_read(&work->flags)); 463 463 } 464 464 465 465 static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) ··· 485 475 } 486 476 487 477 static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, 488 - struct io_worker *worker) 478 + struct io_wq *wq) 489 479 __must_hold(acct->lock) 490 480 { 491 481 struct io_wq_work_node *node, *prev; 492 482 struct io_wq_work *work, *tail; 493 483 unsigned int stall_hash = -1U; 494 - struct io_wq *wq = 
worker->wq; 495 484 496 485 wq_list_for_each(node, prev, &acct->work_list) { 486 + unsigned int work_flags; 497 487 unsigned int hash; 498 488 499 489 work = container_of(node, struct io_wq_work, list); 500 490 501 491 /* not hashed, can run anytime */ 502 - if (!io_wq_is_hashed(work)) { 492 + work_flags = atomic_read(&work->flags); 493 + if (!__io_wq_is_hashed(work_flags)) { 503 494 wq_list_del(&acct->work_list, node, prev); 504 495 return work; 505 496 } 506 497 507 - hash = io_get_work_hash(work); 498 + hash = __io_get_work_hash(work_flags); 508 499 /* all items with this hash lie in [work, tail] */ 509 500 tail = wq->hash_tail[hash]; 510 501 ··· 575 564 * can't make progress, any work completion or insertion will 576 565 * clear the stalled flag. 577 566 */ 578 - work = io_get_next_work(acct, worker); 567 + work = io_get_next_work(acct, wq); 579 568 if (work) { 580 569 /* 581 570 * Make sure cancelation can find this, even before ··· 594 583 if (!work) 595 584 break; 596 585 597 - __io_worker_busy(wq, worker); 586 + __io_worker_busy(acct, worker); 598 587 599 588 io_assign_current_work(worker, work); 600 589 __set_current_state(TASK_RUNNING); ··· 602 591 /* handle a whole dependent link */ 603 592 do { 604 593 struct io_wq_work *next_hashed, *linked; 605 - unsigned int hash = io_get_work_hash(work); 594 + unsigned int work_flags = atomic_read(&work->flags); 595 + unsigned int hash = __io_wq_is_hashed(work_flags) 596 + ? 
__io_get_work_hash(work_flags) 597 + : -1U; 606 598 607 599 next_hashed = wq_next_work(work); 608 600 609 601 if (do_kill && 610 - (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) 602 + (work_flags & IO_WQ_WORK_UNBOUND)) 611 603 atomic_or(IO_WQ_WORK_CANCEL, &work->flags); 612 604 wq->do_work(work); 613 605 io_assign_current_work(worker, NULL); ··· 668 654 while (io_acct_run_queue(acct)) 669 655 io_worker_handle_work(acct, worker); 670 656 671 - raw_spin_lock(&wq->lock); 657 + raw_spin_lock(&acct->workers_lock); 672 658 /* 673 659 * Last sleep timed out. Exit if we're not the last worker, 674 660 * or if someone modified our affinity. 675 661 */ 676 662 if (last_timeout && (exit_mask || acct->nr_workers > 1)) { 677 663 acct->nr_workers--; 678 - raw_spin_unlock(&wq->lock); 664 + raw_spin_unlock(&acct->workers_lock); 679 665 __set_current_state(TASK_RUNNING); 680 666 break; 681 667 } 682 668 last_timeout = false; 683 - __io_worker_idle(wq, worker); 684 - raw_spin_unlock(&wq->lock); 669 + __io_worker_idle(acct, worker); 670 + raw_spin_unlock(&acct->workers_lock); 685 671 if (io_run_task_work()) 686 672 continue; 687 673 ret = schedule_timeout(WORKER_IDLE_TIMEOUT); ··· 742 728 io_wq_dec_running(worker); 743 729 } 744 730 745 - static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, 731 + static void io_init_new_worker(struct io_wq *wq, struct io_wq_acct *acct, struct io_worker *worker, 746 732 struct task_struct *tsk) 747 733 { 748 734 tsk->worker_private = worker; 749 735 worker->task = tsk; 750 736 set_cpus_allowed_ptr(tsk, wq->cpu_mask); 751 737 752 - raw_spin_lock(&wq->lock); 753 - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); 754 - list_add_tail_rcu(&worker->all_list, &wq->all_list); 738 + raw_spin_lock(&acct->workers_lock); 739 + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); 740 + list_add_tail_rcu(&worker->all_list, &acct->all_list); 755 741 set_bit(IO_WORKER_F_FREE, &worker->flags); 756 - 
raw_spin_unlock(&wq->lock); 742 + raw_spin_unlock(&acct->workers_lock); 757 743 wake_up_new_task(tsk); 758 744 } 759 745 ··· 801 787 struct io_worker *worker; 802 788 struct task_struct *tsk; 803 789 struct io_wq *wq; 790 + struct io_wq_acct *acct; 804 791 805 792 worker = container_of(cb, struct io_worker, create_work); 806 793 clear_bit_unlock(0, &worker->create_state); 807 794 wq = worker->wq; 795 + acct = io_wq_get_acct(worker); 808 796 tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); 809 797 if (!IS_ERR(tsk)) { 810 - io_init_new_worker(wq, worker, tsk); 798 + io_init_new_worker(wq, acct, worker, tsk); 811 799 io_worker_release(worker); 812 800 return; 813 801 } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { 814 - struct io_wq_acct *acct = io_wq_get_acct(worker); 815 - 816 802 atomic_dec(&acct->nr_running); 817 - raw_spin_lock(&wq->lock); 803 + raw_spin_lock(&acct->workers_lock); 818 804 acct->nr_workers--; 819 805 if (!acct->nr_workers) { 820 806 struct io_cb_cancel_data match = { ··· 822 808 .cancel_all = true, 823 809 }; 824 810 825 - raw_spin_unlock(&wq->lock); 811 + raw_spin_unlock(&acct->workers_lock); 826 812 while (io_acct_cancel_pending_work(wq, acct, &match)) 827 813 ; 828 814 } else { 829 - raw_spin_unlock(&wq->lock); 815 + raw_spin_unlock(&acct->workers_lock); 830 816 } 831 817 io_worker_ref_put(wq); 832 818 kfree(worker); ··· 848 834 kfree(worker); 849 835 } 850 836 851 - static bool create_io_worker(struct io_wq *wq, int index) 837 + static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct) 852 838 { 853 - struct io_wq_acct *acct = &wq->acct[index]; 854 839 struct io_worker *worker; 855 840 struct task_struct *tsk; 856 841 ··· 859 846 if (!worker) { 860 847 fail: 861 848 atomic_dec(&acct->nr_running); 862 - raw_spin_lock(&wq->lock); 849 + raw_spin_lock(&acct->workers_lock); 863 850 acct->nr_workers--; 864 - raw_spin_unlock(&wq->lock); 851 + raw_spin_unlock(&acct->workers_lock); 865 852 io_worker_ref_put(wq); 866 
853 return false; 867 854 } 868 855 869 856 refcount_set(&worker->ref, 1); 870 857 worker->wq = wq; 858 + worker->acct = acct; 871 859 raw_spin_lock_init(&worker->lock); 872 860 init_completion(&worker->ref_done); 873 861 874 - if (index == IO_WQ_ACCT_BOUND) 875 - set_bit(IO_WORKER_F_BOUND, &worker->flags); 876 - 877 862 tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); 878 863 if (!IS_ERR(tsk)) { 879 - io_init_new_worker(wq, worker, tsk); 864 + io_init_new_worker(wq, acct, worker, tsk); 880 865 } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { 881 866 kfree(worker); 882 867 goto fail; ··· 890 879 * Iterate the passed in list and call the specific function for each 891 880 * worker that isn't exiting 892 881 */ 893 - static bool io_wq_for_each_worker(struct io_wq *wq, 894 - bool (*func)(struct io_worker *, void *), 895 - void *data) 882 + static bool io_acct_for_each_worker(struct io_wq_acct *acct, 883 + bool (*func)(struct io_worker *, void *), 884 + void *data) 896 885 { 897 886 struct io_worker *worker; 898 887 bool ret = false; 899 888 900 - list_for_each_entry_rcu(worker, &wq->all_list, all_list) { 889 + list_for_each_entry_rcu(worker, &acct->all_list, all_list) { 901 890 if (io_worker_get(worker)) { 902 891 /* no task if node is/was offline */ 903 892 if (worker->task) ··· 909 898 } 910 899 911 900 return ret; 901 + } 902 + 903 + static bool io_wq_for_each_worker(struct io_wq *wq, 904 + bool (*func)(struct io_worker *, void *), 905 + void *data) 906 + { 907 + for (int i = 0; i < IO_WQ_ACCT_NR; i++) { 908 + if (!io_acct_for_each_worker(&wq->acct[i], func, data)) 909 + return false; 910 + } 911 + 912 + return true; 912 913 } 913 914 914 915 static bool io_wq_worker_wake(struct io_worker *worker, void *data) ··· 939 916 } while (work); 940 917 } 941 918 942 - static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work) 919 + static void io_wq_insert_work(struct io_wq *wq, struct io_wq_acct *acct, 920 + struct io_wq_work *work, 
unsigned int work_flags) 943 921 { 944 - struct io_wq_acct *acct = io_work_get_acct(wq, work); 945 922 unsigned int hash; 946 923 struct io_wq_work *tail; 947 924 948 - if (!io_wq_is_hashed(work)) { 925 + if (!__io_wq_is_hashed(work_flags)) { 949 926 append: 950 927 wq_list_add_tail(&work->list, &acct->work_list); 951 928 return; 952 929 } 953 930 954 - hash = io_get_work_hash(work); 931 + hash = __io_get_work_hash(work_flags); 955 932 tail = wq->hash_tail[hash]; 956 933 wq->hash_tail[hash] = work; 957 934 if (!tail) ··· 967 944 968 945 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) 969 946 { 970 - struct io_wq_acct *acct = io_work_get_acct(wq, work); 971 947 unsigned int work_flags = atomic_read(&work->flags); 948 + struct io_wq_acct *acct = io_work_get_acct(wq, work_flags); 972 949 struct io_cb_cancel_data match = { 973 950 .fn = io_wq_work_match_item, 974 951 .data = work, ··· 987 964 } 988 965 989 966 raw_spin_lock(&acct->lock); 990 - io_wq_insert_work(wq, work); 967 + io_wq_insert_work(wq, acct, work, work_flags); 991 968 clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); 992 969 raw_spin_unlock(&acct->lock); 993 970 994 971 rcu_read_lock(); 995 - do_create = !io_wq_activate_free_worker(wq, acct); 972 + do_create = !io_acct_activate_free_worker(acct); 996 973 rcu_read_unlock(); 997 974 998 975 if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || ··· 1003 980 if (likely(did_create)) 1004 981 return; 1005 982 1006 - raw_spin_lock(&wq->lock); 983 + raw_spin_lock(&acct->workers_lock); 1007 984 if (acct->nr_workers) { 1008 - raw_spin_unlock(&wq->lock); 985 + raw_spin_unlock(&acct->workers_lock); 1009 986 return; 1010 987 } 1011 - raw_spin_unlock(&wq->lock); 988 + raw_spin_unlock(&acct->workers_lock); 1012 989 1013 990 /* fatal condition, failed to create the first worker */ 1014 991 io_acct_cancel_pending_work(wq, acct, &match); ··· 1057 1034 } 1058 1035 1059 1036 static inline void io_wq_remove_pending(struct io_wq *wq, 1037 + struct io_wq_acct 
*acct, 1060 1038 struct io_wq_work *work, 1061 1039 struct io_wq_work_node *prev) 1062 1040 { 1063 - struct io_wq_acct *acct = io_work_get_acct(wq, work); 1064 1041 unsigned int hash = io_get_work_hash(work); 1065 1042 struct io_wq_work *prev_work = NULL; 1066 1043 ··· 1087 1064 work = container_of(node, struct io_wq_work, list); 1088 1065 if (!match->fn(work, match->data)) 1089 1066 continue; 1090 - io_wq_remove_pending(wq, work, prev); 1067 + io_wq_remove_pending(wq, acct, work, prev); 1091 1068 raw_spin_unlock(&acct->lock); 1092 1069 io_run_cancel(work, wq); 1093 1070 match->nr_pending++; ··· 1115 1092 } 1116 1093 } 1117 1094 1095 + static void io_acct_cancel_running_work(struct io_wq_acct *acct, 1096 + struct io_cb_cancel_data *match) 1097 + { 1098 + raw_spin_lock(&acct->workers_lock); 1099 + io_acct_for_each_worker(acct, io_wq_worker_cancel, match); 1100 + raw_spin_unlock(&acct->workers_lock); 1101 + } 1102 + 1118 1103 static void io_wq_cancel_running_work(struct io_wq *wq, 1119 1104 struct io_cb_cancel_data *match) 1120 1105 { 1121 1106 rcu_read_lock(); 1122 - io_wq_for_each_worker(wq, io_wq_worker_cancel, match); 1107 + 1108 + for (int i = 0; i < IO_WQ_ACCT_NR; i++) 1109 + io_acct_cancel_running_work(&wq->acct[i], match); 1110 + 1123 1111 rcu_read_unlock(); 1124 1112 } 1125 1113 ··· 1153 1119 * as an indication that we attempt to signal cancellation. The 1154 1120 * completion will run normally in this case. 1155 1121 * 1156 - * Do both of these while holding the wq->lock, to ensure that 1122 + * Do both of these while holding the acct->workers_lock, to ensure that 1157 1123 * we'll find a work item regardless of state. 
1158 1124 */ 1159 1125 io_wq_cancel_pending_work(wq, &match); 1160 1126 if (match.nr_pending && !match.cancel_all) 1161 1127 return IO_WQ_CANCEL_OK; 1162 1128 1163 - raw_spin_lock(&wq->lock); 1164 1129 io_wq_cancel_running_work(wq, &match); 1165 - raw_spin_unlock(&wq->lock); 1166 1130 if (match.nr_running && !match.cancel_all) 1167 1131 return IO_WQ_CANCEL_RUNNING; 1168 1132 ··· 1184 1152 struct io_wq_acct *acct = &wq->acct[i]; 1185 1153 1186 1154 if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) 1187 - io_wq_activate_free_worker(wq, acct); 1155 + io_acct_activate_free_worker(acct); 1188 1156 } 1189 1157 rcu_read_unlock(); 1190 1158 return 1; ··· 1222 1190 for (i = 0; i < IO_WQ_ACCT_NR; i++) { 1223 1191 struct io_wq_acct *acct = &wq->acct[i]; 1224 1192 1225 - acct->index = i; 1226 1193 atomic_set(&acct->nr_running, 0); 1194 + 1195 + raw_spin_lock_init(&acct->workers_lock); 1196 + INIT_HLIST_NULLS_HEAD(&acct->free_list, 0); 1197 + INIT_LIST_HEAD(&acct->all_list); 1198 + 1227 1199 INIT_WQ_LIST(&acct->work_list); 1228 1200 raw_spin_lock_init(&acct->lock); 1229 1201 } 1230 - 1231 - raw_spin_lock_init(&wq->lock); 1232 - INIT_HLIST_NULLS_HEAD(&wq->free_list, 0); 1233 - INIT_LIST_HEAD(&wq->all_list); 1234 1202 1235 1203 wq->task = get_task_struct(data->task); 1236 1204 atomic_set(&wq->worker_refs, 1); ··· 1417 1385 1418 1386 rcu_read_lock(); 1419 1387 1420 - raw_spin_lock(&wq->lock); 1421 1388 for (i = 0; i < IO_WQ_ACCT_NR; i++) { 1422 1389 acct = &wq->acct[i]; 1390 + raw_spin_lock(&acct->workers_lock); 1423 1391 prev[i] = max_t(int, acct->max_workers, prev[i]); 1424 1392 if (new_count[i]) 1425 1393 acct->max_workers = new_count[i]; 1394 + raw_spin_unlock(&acct->workers_lock); 1426 1395 } 1427 - raw_spin_unlock(&wq->lock); 1428 1396 rcu_read_unlock(); 1429 1397 1430 1398 for (i = 0; i < IO_WQ_ACCT_NR; i++)
+6 -1
io_uring/io-wq.h
··· 54 54 int io_wq_max_workers(struct io_wq *wq, int *new_count); 55 55 bool io_wq_worker_stopped(void); 56 56 57 + static inline bool __io_wq_is_hashed(unsigned int work_flags) 58 + { 59 + return work_flags & IO_WQ_WORK_HASHED; 60 + } 61 + 57 62 static inline bool io_wq_is_hashed(struct io_wq_work *work) 58 63 { 59 - return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; 64 + return __io_wq_is_hashed(atomic_read(&work->flags)); 60 65 } 61 66 62 67 typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
+126 -91
io_uring/io_uring.c
··· 110 110 #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ 111 111 IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) 112 112 113 + #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) 114 + 113 115 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ 114 116 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ 115 117 REQ_F_ASYNC_DATA) 116 118 117 - #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ 119 + #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ 118 120 REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) 119 121 120 122 #define IO_TCTX_REFS_CACHE_NR (1U << 10) ··· 133 131 134 132 /* requests with any of those set should undergo io_disarm_next() */ 135 133 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 136 - #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) 137 134 138 135 /* 139 136 * No waiters. It's larger than any valid value of the tw counter ··· 255 254 percpu_ref_get(&ctx->refs); 256 255 mutex_lock(&ctx->uring_lock); 257 256 llist_for_each_entry_safe(req, tmp, node, io_task_work.node) 258 - req->io_task_work.func(req, &ts); 257 + req->io_task_work.func(req, ts); 259 258 io_submit_flush_completions(ctx); 260 259 mutex_unlock(&ctx->uring_lock); 261 260 percpu_ref_put(&ctx->refs); ··· 281 280 for (i = 0; i < hash_buckets; i++) 282 281 INIT_HLIST_HEAD(&table->hbs[i].list); 283 282 return 0; 283 + } 284 + 285 + static void io_free_alloc_caches(struct io_ring_ctx *ctx) 286 + { 287 + io_alloc_cache_free(&ctx->apoll_cache, kfree); 288 + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 289 + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 290 + io_alloc_cache_free(&ctx->uring_cache, kfree); 291 + io_alloc_cache_free(&ctx->msg_cache, kfree); 292 + io_futex_cache_free(ctx); 293 + io_rsrc_cache_free(ctx); 284 294 } 285 295 286 296 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ··· 325 313 
init_waitqueue_head(&ctx->sqo_sq_wait); 326 314 INIT_LIST_HEAD(&ctx->sqd_list); 327 315 INIT_LIST_HEAD(&ctx->cq_overflow_list); 328 - INIT_LIST_HEAD(&ctx->io_buffers_cache); 329 316 ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, 330 317 sizeof(struct async_poll), 0); 331 318 ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, ··· 339 328 ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, 340 329 sizeof(struct io_kiocb), 0); 341 330 ret |= io_futex_cache_init(ctx); 331 + ret |= io_rsrc_cache_init(ctx); 342 332 if (ret) 343 333 goto free_ref; 344 334 init_completion(&ctx->ref_comp); ··· 350 338 spin_lock_init(&ctx->completion_lock); 351 339 raw_spin_lock_init(&ctx->timeout_lock); 352 340 INIT_WQ_LIST(&ctx->iopoll_list); 353 - INIT_LIST_HEAD(&ctx->io_buffers_comp); 354 341 INIT_LIST_HEAD(&ctx->defer_list); 355 342 INIT_LIST_HEAD(&ctx->timeout_list); 356 343 INIT_LIST_HEAD(&ctx->ltimeout_list); ··· 371 360 free_ref: 372 361 percpu_ref_exit(&ctx->refs); 373 362 err: 374 - io_alloc_cache_free(&ctx->apoll_cache, kfree); 375 - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 376 - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 377 - io_alloc_cache_free(&ctx->uring_cache, kfree); 378 - io_alloc_cache_free(&ctx->msg_cache, kfree); 379 - io_futex_cache_free(ctx); 363 + io_free_alloc_caches(ctx); 380 364 kvfree(ctx->cancel_table.hbs); 381 365 xa_destroy(&ctx->io_bl_xa); 382 366 kfree(ctx); ··· 399 393 400 394 static void io_clean_op(struct io_kiocb *req) 401 395 { 402 - if (req->flags & REQ_F_BUFFER_SELECTED) { 403 - spin_lock(&req->ctx->completion_lock); 404 - io_kbuf_drop(req); 405 - spin_unlock(&req->ctx->completion_lock); 406 - } 396 + if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) 397 + io_kbuf_drop_legacy(req); 407 398 408 399 if (req->flags & REQ_F_NEED_CLEANUP) { 409 400 const struct io_cold_def *def = &io_cold_defs[req->opcode]; ··· 545 542 io_queue_linked_timeout(link); 546 543 } 547 544 548 
- static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts) 545 + static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) 549 546 { 550 547 io_queue_iowq(req); 551 548 } ··· 902 899 * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires 903 900 * the submitter task context, IOPOLL protects with uring_lock. 904 901 */ 905 - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { 902 + if (ctx->lockless_cq) { 906 903 req->io_task_work.func = io_req_task_complete; 907 904 io_req_task_work_add(req); 908 905 return; ··· 1024 1021 return nxt; 1025 1022 } 1026 1023 1027 - static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) 1024 + static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) 1028 1025 { 1029 1026 if (!ctx) 1030 1027 return; ··· 1054 1051 io_task_work.node); 1055 1052 1056 1053 if (req->ctx != ctx) { 1057 - ctx_flush_and_put(ctx, &ts); 1054 + ctx_flush_and_put(ctx, ts); 1058 1055 ctx = req->ctx; 1059 1056 mutex_lock(&ctx->uring_lock); 1060 1057 percpu_ref_get(&ctx->refs); 1061 1058 } 1062 1059 INDIRECT_CALL_2(req->io_task_work.func, 1063 1060 io_poll_task_func, io_req_rw_complete, 1064 - req, &ts); 1061 + req, ts); 1065 1062 node = next; 1066 1063 (*count)++; 1067 1064 if (unlikely(need_resched())) { 1068 - ctx_flush_and_put(ctx, &ts); 1065 + ctx_flush_and_put(ctx, ts); 1069 1066 ctx = NULL; 1070 1067 cond_resched(); 1071 1068 } 1072 1069 } while (node && *count < max_entries); 1073 1070 1074 - ctx_flush_and_put(ctx, &ts); 1071 + ctx_flush_and_put(ctx, ts); 1075 1072 return node; 1076 1073 } 1077 1074 ··· 1160 1157 * We don't know how many reuqests is there in the link and whether 1161 1158 * they can even be queued lazily, fall back to non-lazy. 
1162 1159 */ 1163 - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) 1160 + if (req->flags & IO_REQ_LINK_FLAGS) 1164 1161 flags &= ~IOU_F_TWQ_LAZY_WAKE; 1165 1162 1166 1163 guard(rcu)(); ··· 1279 1276 } 1280 1277 1281 1278 static int __io_run_local_work_loop(struct llist_node **node, 1282 - struct io_tw_state *ts, 1279 + io_tw_token_t tw, 1283 1280 int events) 1284 1281 { 1285 1282 int ret = 0; ··· 1290 1287 io_task_work.node); 1291 1288 INDIRECT_CALL_2(req->io_task_work.func, 1292 1289 io_poll_task_func, io_req_rw_complete, 1293 - req, ts); 1290 + req, tw); 1294 1291 *node = next; 1295 1292 if (++ret >= events) 1296 1293 break; ··· 1299 1296 return ret; 1300 1297 } 1301 1298 1302 - static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, 1299 + static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, 1303 1300 int min_events, int max_events) 1304 1301 { 1305 1302 struct llist_node *node; ··· 1312 1309 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1313 1310 again: 1314 1311 min_events -= ret; 1315 - ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events); 1312 + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); 1316 1313 if (ctx->retry_llist.first) 1317 1314 goto retry_done; 1318 1315 ··· 1321 1318 * running the pending items. 
1322 1319 */ 1323 1320 node = llist_reverse_order(llist_del_all(&ctx->work_llist)); 1324 - ret += __io_run_local_work_loop(&node, ts, max_events - ret); 1321 + ret += __io_run_local_work_loop(&node, tw, max_events - ret); 1325 1322 ctx->retry_llist.first = node; 1326 1323 loops++; 1327 1324 ··· 1343 1340 1344 1341 if (!io_local_work_pending(ctx)) 1345 1342 return 0; 1346 - return __io_run_local_work(ctx, &ts, min_events, 1343 + return __io_run_local_work(ctx, ts, min_events, 1347 1344 max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 1348 1345 } 1349 1346 ··· 1354 1351 int ret; 1355 1352 1356 1353 mutex_lock(&ctx->uring_lock); 1357 - ret = __io_run_local_work(ctx, &ts, min_events, max_events); 1354 + ret = __io_run_local_work(ctx, ts, min_events, max_events); 1358 1355 mutex_unlock(&ctx->uring_lock); 1359 1356 return ret; 1360 1357 } 1361 1358 1362 - static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) 1359 + static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) 1363 1360 { 1364 - io_tw_lock(req->ctx, ts); 1361 + io_tw_lock(req->ctx, tw); 1365 1362 io_req_defer_failed(req, req->cqe.res); 1366 1363 } 1367 1364 1368 - void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) 1365 + void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) 1369 1366 { 1370 - io_tw_lock(req->ctx, ts); 1367 + io_tw_lock(req->ctx, tw); 1371 1368 if (unlikely(io_should_terminate_tw())) 1372 1369 io_req_defer_failed(req, -EFAULT); 1373 1370 else if (req->flags & REQ_F_FORCE_ASYNC) ··· 1422 1419 1423 1420 if (apoll->double_poll) 1424 1421 kfree(apoll->double_poll); 1425 - if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) 1426 - kfree(apoll); 1422 + io_cache_free(&ctx->apoll_cache, apoll); 1427 1423 req->flags &= ~REQ_F_POLLED; 1428 1424 } 1429 1425 if (req->flags & IO_REQ_LINK_FLAGS) ··· 1584 1582 return 0; 1585 1583 } 1586 1584 1587 - void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) 1585 + void 
io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw) 1588 1586 { 1589 1587 io_req_complete_defer(req); 1590 1588 } ··· 1721 1719 return !!req->file; 1722 1720 } 1723 1721 1724 - static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 1722 + static inline int __io_issue_sqe(struct io_kiocb *req, 1723 + unsigned int issue_flags, 1724 + const struct io_issue_def *def) 1725 1725 { 1726 - const struct io_issue_def *def = &io_issue_defs[req->opcode]; 1727 1726 const struct cred *creds = NULL; 1728 1727 int ret; 1729 - 1730 - if (unlikely(!io_assign_file(req, def, issue_flags))) 1731 - return -EBADF; 1732 1728 1733 1729 if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) 1734 1730 creds = override_creds(req->creds); ··· 1741 1741 1742 1742 if (creds) 1743 1743 revert_creds(creds); 1744 + 1745 + return ret; 1746 + } 1747 + 1748 + static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 1749 + { 1750 + const struct io_issue_def *def = &io_issue_defs[req->opcode]; 1751 + int ret; 1752 + 1753 + if (unlikely(!io_assign_file(req, def, issue_flags))) 1754 + return -EBADF; 1755 + 1756 + ret = __io_issue_sqe(req, issue_flags, def); 1744 1757 1745 1758 if (ret == IOU_OK) { 1746 1759 if (issue_flags & IO_URING_F_COMPLETE_DEFER) ··· 1775 1762 return ret; 1776 1763 } 1777 1764 1778 - int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) 1765 + int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) 1779 1766 { 1780 - io_tw_lock(req->ctx, ts); 1781 - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| 1782 - IO_URING_F_COMPLETE_DEFER); 1767 + const unsigned int issue_flags = IO_URING_F_NONBLOCK | 1768 + IO_URING_F_MULTISHOT | 1769 + IO_URING_F_COMPLETE_DEFER; 1770 + int ret; 1771 + 1772 + io_tw_lock(req->ctx, tw); 1773 + 1774 + WARN_ON_ONCE(!req->file); 1775 + if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) 1776 + return -EFAULT; 1777 + 1778 + ret = __io_issue_sqe(req, issue_flags, 
&io_issue_defs[req->opcode]); 1779 + 1780 + WARN_ON_ONCE(ret == IOU_OK); 1781 + 1782 + if (ret == IOU_ISSUE_SKIP_COMPLETE) 1783 + ret = 0; 1784 + return ret; 1783 1785 } 1784 1786 1785 1787 struct io_wq_work *io_wq_free_work(struct io_wq_work *work) ··· 2024 1996 return true; 2025 1997 } 2026 1998 2027 - static void io_init_req_drain(struct io_kiocb *req) 1999 + static void io_init_drain(struct io_ring_ctx *ctx) 2028 2000 { 2029 - struct io_ring_ctx *ctx = req->ctx; 2030 2001 struct io_kiocb *head = ctx->submit_state.link.head; 2031 2002 2032 2003 ctx->drain_active = true; ··· 2089 2062 if (sqe_flags & IOSQE_IO_DRAIN) { 2090 2063 if (ctx->drain_disabled) 2091 2064 return io_init_fail_req(req, -EOPNOTSUPP); 2092 - io_init_req_drain(req); 2065 + io_init_drain(ctx); 2093 2066 } 2094 2067 } 2095 2068 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { ··· 2731 2704 io_sqe_files_unregister(ctx); 2732 2705 io_cqring_overflow_kill(ctx); 2733 2706 io_eventfd_unregister(ctx); 2734 - io_alloc_cache_free(&ctx->apoll_cache, kfree); 2735 - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); 2736 - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); 2737 - io_alloc_cache_free(&ctx->uring_cache, kfree); 2738 - io_alloc_cache_free(&ctx->msg_cache, kfree); 2739 - io_futex_cache_free(ctx); 2707 + io_free_alloc_caches(ctx); 2740 2708 io_destroy_buffers(ctx); 2741 2709 io_free_region(ctx, &ctx->param_region); 2742 2710 mutex_unlock(&ctx->uring_lock); ··· 3559 3537 O_RDWR | O_CLOEXEC, NULL); 3560 3538 } 3561 3539 3540 + static int io_uring_sanitise_params(struct io_uring_params *p) 3541 + { 3542 + unsigned flags = p->flags; 3543 + 3544 + /* There is no way to mmap rings without a real fd */ 3545 + if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && 3546 + !(flags & IORING_SETUP_NO_MMAP)) 3547 + return -EINVAL; 3548 + 3549 + if (flags & IORING_SETUP_SQPOLL) { 3550 + /* IPI related flags don't make sense with SQPOLL */ 3551 + if (flags & 
(IORING_SETUP_COOP_TASKRUN | 3552 + IORING_SETUP_TASKRUN_FLAG | 3553 + IORING_SETUP_DEFER_TASKRUN)) 3554 + return -EINVAL; 3555 + } 3556 + 3557 + if (flags & IORING_SETUP_TASKRUN_FLAG) { 3558 + if (!(flags & (IORING_SETUP_COOP_TASKRUN | 3559 + IORING_SETUP_DEFER_TASKRUN))) 3560 + return -EINVAL; 3561 + } 3562 + 3563 + /* HYBRID_IOPOLL only valid with IOPOLL */ 3564 + if ((flags & IORING_SETUP_HYBRID_IOPOLL) && !(flags & IORING_SETUP_IOPOLL)) 3565 + return -EINVAL; 3566 + 3567 + /* 3568 + * For DEFER_TASKRUN we require the completion task to be the same as 3569 + * the submission task. This implies that there is only one submitter. 3570 + */ 3571 + if ((flags & IORING_SETUP_DEFER_TASKRUN) && 3572 + !(flags & IORING_SETUP_SINGLE_ISSUER)) 3573 + return -EINVAL; 3574 + 3575 + return 0; 3576 + } 3577 + 3562 3578 int io_uring_fill_params(unsigned entries, struct io_uring_params *p) 3563 3579 { 3564 3580 if (!entries) ··· 3606 3546 return -EINVAL; 3607 3547 entries = IORING_MAX_ENTRIES; 3608 3548 } 3609 - 3610 - if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) 3611 - && !(p->flags & IORING_SETUP_NO_MMAP)) 3612 - return -EINVAL; 3613 3549 3614 3550 /* 3615 3551 * Use twice as many entries for the CQ ring. It's possible for the ··· 3668 3612 struct file *file; 3669 3613 int ret; 3670 3614 3615 + ret = io_uring_sanitise_params(p); 3616 + if (ret) 3617 + return ret; 3618 + 3671 3619 ret = io_uring_fill_params(entries, p); 3672 3620 if (unlikely(ret)) 3673 3621 return ret; ··· 3719 3659 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if 3720 3660 * COOP_TASKRUN is set, then IPIs are never needed by the app. 
3721 3661 */ 3722 - ret = -EINVAL; 3723 - if (ctx->flags & IORING_SETUP_SQPOLL) { 3724 - /* IPI related flags don't make sense with SQPOLL */ 3725 - if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | 3726 - IORING_SETUP_TASKRUN_FLAG | 3727 - IORING_SETUP_DEFER_TASKRUN)) 3728 - goto err; 3662 + if (ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_COOP_TASKRUN)) 3729 3663 ctx->notify_method = TWA_SIGNAL_NO_IPI; 3730 - } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { 3731 - ctx->notify_method = TWA_SIGNAL_NO_IPI; 3732 - } else { 3733 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && 3734 - !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 3735 - goto err; 3664 + else 3736 3665 ctx->notify_method = TWA_SIGNAL; 3737 - } 3738 - 3739 - /* HYBRID_IOPOLL only valid with IOPOLL */ 3740 - if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) == 3741 - IORING_SETUP_HYBRID_IOPOLL) 3742 - goto err; 3743 - 3744 - /* 3745 - * For DEFER_TASKRUN we require the completion task to be the same as the 3746 - * submission task. This implies that there is only one submitter, so enforce 3747 - * that. 3748 - */ 3749 - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && 3750 - !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { 3751 - goto err; 3752 - } 3753 3666 3754 3667 /* 3755 3668 * This is just grabbed for accounting purposes. When a process exits, ··· 3941 3908 3942 3909 io_uring_optable_init(); 3943 3910 3911 + /* imu->dir is u8 */ 3912 + BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); 3913 + 3944 3914 /* 3945 3915 * Allow user copy in the per-command field, which starts after the 3946 3916 * file in io_kiocb and until the opcode field. 
The openat2 handling ··· 3954 3918 req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, 3955 3919 SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | 3956 3920 SLAB_TYPESAFE_BY_RCU); 3957 - io_buf_cachep = KMEM_CACHE(io_buffer, 3958 - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); 3959 3921 3960 3922 iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); 3923 + BUG_ON(!iou_wq); 3961 3924 3962 3925 #ifdef CONFIG_SYSCTL 3963 3926 register_sysctl_init("kernel", kernel_io_uring_disabled_table);
+9 -5
io_uring/io_uring.h
··· 90 90 unsigned flags); 91 91 bool io_alloc_async_data(struct io_kiocb *req); 92 92 void io_req_task_queue(struct io_kiocb *req); 93 - void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); 93 + void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); 94 94 void io_req_task_queue_fail(struct io_kiocb *req, int ret); 95 - void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); 95 + void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw); 96 96 struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); 97 97 struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); 98 98 void tctx_task_work(struct callback_head *cb); ··· 104 104 int start, int end); 105 105 void io_req_queue_iowq(struct io_kiocb *req); 106 106 107 - int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); 107 + int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw); 108 108 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); 109 109 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); 110 110 void __io_submit_flush_completions(struct io_ring_ctx *ctx); ··· 145 145 lockdep_assert(current == ctx->submitter_task); 146 146 } 147 147 #endif 148 + } 149 + 150 + static inline bool io_is_compat(struct io_ring_ctx *ctx) 151 + { 152 + return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); 148 153 } 149 154 150 155 static inline void io_req_task_work_add(struct io_kiocb *req) ··· 381 376 return task_work_pending(current) || io_local_work_pending(ctx); 382 377 } 383 378 384 - static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) 379 + static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) 385 380 { 386 381 lockdep_assert_held(&ctx->uring_lock); 387 382 } ··· 423 418 } 424 419 425 420 extern struct kmem_cache *req_cachep; 426 - extern struct kmem_cache *io_buf_cachep; 427 421 
428 422 static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) 429 423 {
+76 -96
io_uring/kbuf.c
··· 20 20 /* BIDs are addressed by a 16-bit field in a CQE */ 21 21 #define MAX_BIDS_PER_BGID (1 << 16) 22 22 23 - struct kmem_cache *io_buf_cachep; 23 + /* Mapped buffer ring, return io_uring_buf from head */ 24 + #define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] 24 25 25 26 struct io_provide_buf { 26 27 struct file *file; ··· 31 30 __u32 nbufs; 32 31 __u16 bid; 33 32 }; 33 + 34 + bool io_kbuf_commit(struct io_kiocb *req, 35 + struct io_buffer_list *bl, int len, int nr) 36 + { 37 + if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) 38 + return true; 39 + 40 + req->flags &= ~REQ_F_BUFFERS_COMMIT; 41 + 42 + if (unlikely(len < 0)) 43 + return true; 44 + 45 + if (bl->flags & IOBL_INC) { 46 + struct io_uring_buf *buf; 47 + 48 + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); 49 + if (WARN_ON_ONCE(len > buf->len)) 50 + len = buf->len; 51 + buf->len -= len; 52 + if (buf->len) { 53 + buf->addr += len; 54 + return false; 55 + } 56 + } 57 + 58 + bl->head += nr; 59 + return true; 60 + } 34 61 35 62 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, 36 63 unsigned int bgid) ··· 81 52 return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); 82 53 } 83 54 55 + void io_kbuf_drop_legacy(struct io_kiocb *req) 56 + { 57 + if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED))) 58 + return; 59 + req->buf_index = req->kbuf->bgid; 60 + req->flags &= ~REQ_F_BUFFER_SELECTED; 61 + kfree(req->kbuf); 62 + req->kbuf = NULL; 63 + } 64 + 84 65 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) 85 66 { 86 67 struct io_ring_ctx *ctx = req->ctx; ··· 107 68 108 69 io_ring_submit_unlock(ctx, issue_flags); 109 70 return true; 110 - } 111 - 112 - void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) 113 - { 114 - /* 115 - * We can add this buffer back to two lists: 116 - * 117 - * 1) The io_buffers_cache list. This one is protected by the 118 - * ctx->uring_lock. 
If we already hold this lock, add back to this 119 - * list as we can grab it from issue as well. 120 - * 2) The io_buffers_comp list. This one is protected by the 121 - * ctx->completion_lock. 122 - * 123 - * We migrate buffers from the comp_list to the issue cache list 124 - * when we need one. 125 - */ 126 - if (issue_flags & IO_URING_F_UNLOCKED) { 127 - struct io_ring_ctx *ctx = req->ctx; 128 - 129 - spin_lock(&ctx->completion_lock); 130 - __io_put_kbuf_list(req, len, &ctx->io_buffers_comp); 131 - spin_unlock(&ctx->completion_lock); 132 - } else { 133 - lockdep_assert_held(&req->ctx->uring_lock); 134 - 135 - __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache); 136 - } 137 71 } 138 72 139 73 static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, ··· 354 342 return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); 355 343 } 356 344 345 + static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) 346 + { 347 + struct io_buffer_list *bl = req->buf_list; 348 + bool ret = true; 349 + 350 + if (bl) { 351 + ret = io_kbuf_commit(req, bl, len, nr); 352 + req->buf_index = bl->bgid; 353 + } 354 + req->flags &= ~REQ_F_BUFFER_RING; 355 + return ret; 356 + } 357 + 358 + unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) 359 + { 360 + unsigned int ret; 361 + 362 + ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); 363 + 364 + if (unlikely(!(req->flags & REQ_F_BUFFER_RING))) { 365 + io_kbuf_drop_legacy(req); 366 + return ret; 367 + } 368 + 369 + if (!__io_put_kbuf_ring(req, len, nbufs)) 370 + ret |= IORING_CQE_F_BUF_MORE; 371 + return ret; 372 + } 373 + 357 374 static int __io_remove_buffers(struct io_ring_ctx *ctx, 358 375 struct io_buffer_list *bl, unsigned nbufs) 359 376 { ··· 408 367 struct io_buffer *nxt; 409 368 410 369 nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); 411 - list_move(&nxt->list, &ctx->io_buffers_cache); 370 + list_del(&nxt->list); 371 + 
kfree(nxt); 372 + 412 373 if (++i == nbufs) 413 374 return i; 414 375 cond_resched(); ··· 428 385 void io_destroy_buffers(struct io_ring_ctx *ctx) 429 386 { 430 387 struct io_buffer_list *bl; 431 - struct list_head *item, *tmp; 432 - struct io_buffer *buf; 433 388 434 389 while (1) { 435 390 unsigned long index = 0; ··· 440 399 if (!bl) 441 400 break; 442 401 io_put_bl(ctx, bl); 443 - } 444 - 445 - /* 446 - * Move deferred locked entries to cache before pruning 447 - */ 448 - spin_lock(&ctx->completion_lock); 449 - if (!list_empty(&ctx->io_buffers_comp)) 450 - list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache); 451 - spin_unlock(&ctx->completion_lock); 452 - 453 - list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { 454 - buf = list_entry(item, struct io_buffer, list); 455 - kmem_cache_free(io_buf_cachep, buf); 456 402 } 457 403 } 458 404 ··· 529 501 return 0; 530 502 } 531 503 532 - #define IO_BUFFER_ALLOC_BATCH 64 533 - 534 - static int io_refill_buffer_cache(struct io_ring_ctx *ctx) 535 - { 536 - struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH]; 537 - int allocated; 538 - 539 - /* 540 - * Completions that don't happen inline (eg not under uring_lock) will 541 - * add to ->io_buffers_comp. If we don't have any free buffers, check 542 - * the completion list and splice those entries first. 543 - */ 544 - if (!list_empty_careful(&ctx->io_buffers_comp)) { 545 - spin_lock(&ctx->completion_lock); 546 - if (!list_empty(&ctx->io_buffers_comp)) { 547 - list_splice_init(&ctx->io_buffers_comp, 548 - &ctx->io_buffers_cache); 549 - spin_unlock(&ctx->completion_lock); 550 - return 0; 551 - } 552 - spin_unlock(&ctx->completion_lock); 553 - } 554 - 555 - /* 556 - * No free buffers and no completion entries either. Allocate a new 557 - * batch of buffer entries and add those to our freelist. 
558 - */ 559 - 560 - allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT, 561 - ARRAY_SIZE(bufs), (void **) bufs); 562 - if (unlikely(!allocated)) { 563 - /* 564 - * Bulk alloc is all-or-nothing. If we fail to get a batch, 565 - * retry single alloc to be on the safe side. 566 - */ 567 - bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); 568 - if (!bufs[0]) 569 - return -ENOMEM; 570 - allocated = 1; 571 - } 572 - 573 - while (allocated) 574 - list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache); 575 - 576 - return 0; 577 - } 578 - 579 504 static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, 580 505 struct io_buffer_list *bl) 581 506 { ··· 537 556 int i, bid = pbuf->bid; 538 557 539 558 for (i = 0; i < pbuf->nbufs; i++) { 540 - if (list_empty(&ctx->io_buffers_cache) && 541 - io_refill_buffer_cache(ctx)) 559 + buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); 560 + if (!buf) 542 561 break; 543 - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, 544 - list); 545 - list_move_tail(&buf->list, &bl->buf_list); 562 + 563 + list_add_tail(&buf->list, &bl->buf_list); 546 564 buf->addr = addr; 547 565 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); 548 566 buf->bid = bid;
+11 -89
io_uring/kbuf.h
··· 74 74 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); 75 75 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); 76 76 77 - void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); 78 - 79 77 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); 78 + void io_kbuf_drop_legacy(struct io_kiocb *req); 79 + 80 + unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs); 81 + bool io_kbuf_commit(struct io_kiocb *req, 82 + struct io_buffer_list *bl, int len, int nr); 80 83 81 84 struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, 82 85 unsigned int bgid); ··· 119 116 return false; 120 117 } 121 118 122 - /* Mapped buffer ring, return io_uring_buf from head */ 123 - #define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] 124 - 125 - static inline bool io_kbuf_commit(struct io_kiocb *req, 126 - struct io_buffer_list *bl, int len, int nr) 127 - { 128 - if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) 129 - return true; 130 - 131 - req->flags &= ~REQ_F_BUFFERS_COMMIT; 132 - 133 - if (unlikely(len < 0)) 134 - return true; 135 - 136 - if (bl->flags & IOBL_INC) { 137 - struct io_uring_buf *buf; 138 - 139 - buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); 140 - if (WARN_ON_ONCE(len > buf->len)) 141 - len = buf->len; 142 - buf->len -= len; 143 - if (buf->len) { 144 - buf->addr += len; 145 - return false; 146 - } 147 - } 148 - 149 - bl->head += nr; 150 - return true; 151 - } 152 - 153 - static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) 154 - { 155 - struct io_buffer_list *bl = req->buf_list; 156 - bool ret = true; 157 - 158 - if (bl) { 159 - ret = io_kbuf_commit(req, bl, len, nr); 160 - req->buf_index = bl->bgid; 161 - } 162 - req->flags &= ~REQ_F_BUFFER_RING; 163 - return ret; 164 - } 165 - 166 - static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, 167 - struct list_head *list) 168 - { 169 - if 
(req->flags & REQ_F_BUFFER_RING) { 170 - __io_put_kbuf_ring(req, len, 1); 171 - } else { 172 - req->buf_index = req->kbuf->bgid; 173 - list_add(&req->kbuf->list, list); 174 - req->flags &= ~REQ_F_BUFFER_SELECTED; 175 - } 176 - } 177 - 178 - static inline void io_kbuf_drop(struct io_kiocb *req) 179 - { 180 - lockdep_assert_held(&req->ctx->completion_lock); 181 - 182 - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) 183 - return; 184 - 185 - /* len == 0 is fine here, non-ring will always drop all of it */ 186 - __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp); 187 - } 188 - 189 - static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, 190 - int nbufs, unsigned issue_flags) 191 - { 192 - unsigned int ret; 193 - 194 - if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) 195 - return 0; 196 - 197 - ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); 198 - if (req->flags & REQ_F_BUFFER_RING) { 199 - if (!__io_put_kbuf_ring(req, len, nbufs)) 200 - ret |= IORING_CQE_F_BUF_MORE; 201 - } else { 202 - __io_put_kbuf(req, len, issue_flags); 203 - } 204 - return ret; 205 - } 206 - 207 119 static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, 208 120 unsigned issue_flags) 209 121 { 210 - return __io_put_kbufs(req, len, 1, issue_flags); 122 + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) 123 + return 0; 124 + return __io_put_kbufs(req, len, 1); 211 125 } 212 126 213 127 static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, 214 128 int nbufs, unsigned issue_flags) 215 129 { 216 - return __io_put_kbufs(req, len, nbufs, issue_flags); 130 + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) 131 + return 0; 132 + return __io_put_kbufs(req, len, nbufs); 217 133 } 218 134 #endif
+1 -1
io_uring/msg_ring.c
··· 71 71 return target_ctx->task_complete; 72 72 } 73 73 74 - static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) 74 + static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) 75 75 { 76 76 struct io_ring_ctx *ctx = req->ctx; 77 77
+114 -143
io_uring/net.c
··· 75 75 u16 flags; 76 76 /* initialised and used only by !msg send variants */ 77 77 u16 buf_group; 78 - u16 buf_index; 78 + bool retry; 79 79 void __user *msg_control; 80 80 /* used only for send zerocopy */ 81 81 struct io_kiocb *notif; ··· 187 187 188 188 req->flags &= ~REQ_F_BL_EMPTY; 189 189 sr->done_io = 0; 190 + sr->retry = false; 190 191 sr->len = 0; /* get from the provided buffer */ 191 192 req->buf_index = sr->buf_group; 192 193 } 193 194 194 - #ifdef CONFIG_COMPAT 195 - static int io_compat_msg_copy_hdr(struct io_kiocb *req, 196 - struct io_async_msghdr *iomsg, 197 - struct compat_msghdr *msg, int ddir) 195 + static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg, 196 + const struct iovec __user *uiov, unsigned uvec_seg, 197 + int ddir) 198 198 { 199 - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 200 - struct compat_iovec __user *uiov; 201 199 struct iovec *iov; 202 200 int ret, nr_segs; 203 201 ··· 203 205 nr_segs = iomsg->free_iov_nr; 204 206 iov = iomsg->free_iov; 205 207 } else { 206 - iov = &iomsg->fast_iov; 207 208 nr_segs = 1; 209 + iov = &iomsg->fast_iov; 208 210 } 211 + 212 + ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov, 213 + &iomsg->msg.msg_iter, io_is_compat(req->ctx)); 214 + if (unlikely(ret < 0)) 215 + return ret; 216 + io_net_vec_assign(req, iomsg, iov); 217 + return 0; 218 + } 219 + 220 + static int io_compat_msg_copy_hdr(struct io_kiocb *req, 221 + struct io_async_msghdr *iomsg, 222 + struct compat_msghdr *msg, int ddir, 223 + struct sockaddr __user **save_addr) 224 + { 225 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 226 + struct compat_iovec __user *uiov; 227 + int ret; 209 228 210 229 if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) 211 230 return -EFAULT; 212 231 232 + ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr); 233 + if (ret) 234 + return ret; 235 + 213 236 uiov = compat_ptr(msg->msg_iov); 214 237 if (req->flags & REQ_F_BUFFER_SELECT) { 
215 - compat_ssize_t clen; 216 - 217 238 if (msg->msg_iovlen == 0) { 218 - sr->len = iov->iov_len = 0; 219 - iov->iov_base = NULL; 239 + sr->len = 0; 220 240 } else if (msg->msg_iovlen > 1) { 221 241 return -EINVAL; 222 242 } else { 223 - if (!access_ok(uiov, sizeof(*uiov))) 243 + struct compat_iovec tmp_iov; 244 + 245 + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) 224 246 return -EFAULT; 225 - if (__get_user(clen, &uiov->iov_len)) 226 - return -EFAULT; 227 - if (clen < 0) 228 - return -EINVAL; 229 - sr->len = clen; 247 + sr->len = tmp_iov.iov_len; 230 248 } 231 249 232 250 return 0; 233 251 } 234 252 235 - ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, 236 - nr_segs, &iov, &iomsg->msg.msg_iter, true); 237 - if (unlikely(ret < 0)) 238 - return ret; 239 - 240 - io_net_vec_assign(req, iomsg, iov); 241 - return 0; 253 + return io_net_import_vec(req, iomsg, (struct iovec __user *)uiov, 254 + msg->msg_iovlen, ddir); 242 255 } 243 - #endif 244 256 245 - static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, 246 - struct user_msghdr *msg, int ddir) 257 + static int io_copy_msghdr_from_user(struct user_msghdr *msg, 258 + struct user_msghdr __user *umsg) 247 259 { 248 - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 249 - struct user_msghdr __user *umsg = sr->umsg; 250 - struct iovec *iov; 251 - int ret, nr_segs; 252 - 253 - if (iomsg->free_iov) { 254 - nr_segs = iomsg->free_iov_nr; 255 - iov = iomsg->free_iov; 256 - } else { 257 - iov = &iomsg->fast_iov; 258 - nr_segs = 1; 259 - } 260 - 261 260 if (!user_access_begin(umsg, sizeof(*umsg))) 262 261 return -EFAULT; 263 - 264 - ret = -EFAULT; 265 262 unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end); 266 263 unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end); 267 264 unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end); 268 265 unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end); 269 266 unsafe_get_user(msg->msg_control, 
&umsg->msg_control, ua_end); 270 267 unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end); 268 + user_access_end(); 269 + return 0; 270 + ua_end: 271 + user_access_end(); 272 + return -EFAULT; 273 + } 274 + 275 + static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, 276 + struct user_msghdr *msg, int ddir, 277 + struct sockaddr __user **save_addr) 278 + { 279 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 280 + struct user_msghdr __user *umsg = sr->umsg; 281 + int ret; 282 + 283 + ret = io_copy_msghdr_from_user(msg, umsg); 284 + if (unlikely(ret)) 285 + return ret; 286 + 271 287 msg->msg_flags = 0; 288 + 289 + ret = __copy_msghdr(&iomsg->msg, msg, save_addr); 290 + if (ret) 291 + return ret; 272 292 273 293 if (req->flags & REQ_F_BUFFER_SELECT) { 274 294 if (msg->msg_iovlen == 0) { 275 - sr->len = iov->iov_len = 0; 276 - iov->iov_base = NULL; 295 + sr->len = 0; 277 296 } else if (msg->msg_iovlen > 1) { 278 - ret = -EINVAL; 279 - goto ua_end; 297 + return -EINVAL; 280 298 } else { 281 299 struct iovec __user *uiov = msg->msg_iov; 300 + struct iovec tmp_iov; 282 301 283 - /* we only need the length for provided buffers */ 284 - if (!access_ok(&uiov->iov_len, sizeof(uiov->iov_len))) 285 - goto ua_end; 286 - unsafe_get_user(iov->iov_len, &uiov->iov_len, ua_end); 287 - sr->len = iov->iov_len; 302 + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) 303 + return -EFAULT; 304 + sr->len = tmp_iov.iov_len; 288 305 } 289 - ret = 0; 290 - ua_end: 291 - user_access_end(); 292 - return ret; 306 + return 0; 293 307 } 294 308 295 - user_access_end(); 296 - ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, 297 - &iov, &iomsg->msg.msg_iter, false); 298 - if (unlikely(ret < 0)) 299 - return ret; 300 - 301 - io_net_vec_assign(req, iomsg, iov); 302 - return 0; 309 + return io_net_import_vec(req, iomsg, msg->msg_iov, msg->msg_iovlen, ddir); 303 310 } 304 311 305 312 static int io_sendmsg_copy_hdr(struct 
io_kiocb *req, ··· 317 314 iomsg->msg.msg_name = &iomsg->addr; 318 315 iomsg->msg.msg_iter.nr_segs = 0; 319 316 320 - #ifdef CONFIG_COMPAT 321 - if (unlikely(req->ctx->compat)) { 317 + if (io_is_compat(req->ctx)) { 322 318 struct compat_msghdr cmsg; 323 319 324 - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE); 325 - if (unlikely(ret)) 326 - return ret; 327 - 328 - ret = __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); 320 + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE, 321 + NULL); 329 322 sr->msg_control = iomsg->msg.msg_control_user; 330 323 return ret; 331 324 } 332 - #endif 333 325 334 - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE); 335 - if (unlikely(ret)) 336 - return ret; 337 - 338 - ret = __copy_msghdr(&iomsg->msg, &msg, NULL); 339 - 326 + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL); 340 327 /* save msg_control as sys_sendmsg() overwrites it */ 341 328 sr->msg_control = iomsg->msg.msg_control_user; 342 329 return ret; ··· 380 387 { 381 388 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 382 389 struct io_async_msghdr *kmsg = req->async_data; 383 - int ret; 384 390 385 391 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 386 392 387 - ret = io_sendmsg_copy_hdr(req, kmsg); 388 - if (!ret) 389 - req->flags |= REQ_F_NEED_CLEANUP; 390 - return ret; 393 + return io_sendmsg_copy_hdr(req, kmsg); 391 394 } 392 395 393 396 #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) ··· 393 404 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 394 405 395 406 sr->done_io = 0; 407 + sr->retry = false; 396 408 397 409 if (req->opcode != IORING_OP_SEND) { 398 410 if (sqe->addr2 || sqe->file_index) ··· 417 427 req->buf_list = NULL; 418 428 } 419 429 420 - #ifdef CONFIG_COMPAT 421 - if (req->ctx->compat) 430 + if (io_is_compat(req->ctx)) 422 431 sr->msg_flags |= MSG_CMSG_COMPAT; 423 - #endif 432 + 424 433 if (unlikely(!io_msg_alloc_async(req))) 425 434 return -ENOMEM; 426 435 if 
(req->opcode != IORING_OP_SENDMSG) ··· 704 715 iomsg->msg.msg_name = &iomsg->addr; 705 716 iomsg->msg.msg_iter.nr_segs = 0; 706 717 707 - #ifdef CONFIG_COMPAT 708 - if (unlikely(req->ctx->compat)) { 718 + if (io_is_compat(req->ctx)) { 709 719 struct compat_msghdr cmsg; 710 720 711 - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST); 712 - if (unlikely(ret)) 713 - return ret; 714 - 715 - ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr); 716 - if (unlikely(ret)) 717 - return ret; 718 - 719 - return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen, 720 - cmsg.msg_controllen); 721 + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST, 722 + &iomsg->uaddr); 723 + memset(&msg, 0, sizeof(msg)); 724 + msg.msg_namelen = cmsg.msg_namelen; 725 + msg.msg_controllen = cmsg.msg_controllen; 726 + } else { 727 + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); 721 728 } 722 - #endif 723 729 724 - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST); 725 730 if (unlikely(ret)) 726 731 return ret; 727 - 728 - ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); 729 - if (unlikely(ret)) 730 - return ret; 731 - 732 732 return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, 733 733 msg.msg_controllen); 734 734 } ··· 751 773 return 0; 752 774 } 753 775 754 - ret = io_recvmsg_copy_hdr(req, kmsg); 755 - if (!ret) 756 - req->flags |= REQ_F_NEED_CLEANUP; 757 - return ret; 776 + return io_recvmsg_copy_hdr(req, kmsg); 758 777 } 759 778 760 779 #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ ··· 762 787 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 763 788 764 789 sr->done_io = 0; 790 + sr->retry = false; 765 791 766 792 if (unlikely(sqe->file_index || sqe->addr2)) 767 793 return -EINVAL; ··· 803 827 return -EINVAL; 804 828 } 805 829 806 - #ifdef CONFIG_COMPAT 807 - if (req->ctx->compat) 830 + if (io_is_compat(req->ctx)) 808 831 sr->msg_flags |= MSG_CMSG_COMPAT; 809 - #endif 832 + 810 833 
sr->nr_multishot_loops = 0; 811 834 return io_recvmsg_prep_setup(req); 812 835 } 836 + 837 + /* bits to clear in old and inherit in new cflags on bundle retry */ 838 + #define CQE_F_MASK (IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE) 813 839 814 840 /* 815 841 * Finishes io_recv and io_recvmsg. ··· 832 854 if (sr->flags & IORING_RECVSEND_BUNDLE) { 833 855 cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), 834 856 issue_flags); 857 + if (sr->retry) 858 + cflags = req->cqe.flags | (cflags & CQE_F_MASK); 835 859 /* bundle with no more immediate buffers, we're done */ 836 860 if (req->flags & REQ_F_BL_EMPTY) 837 861 goto finish; 862 + /* if more is available, retry and append to this one */ 863 + if (!sr->retry && kmsg->msg.msg_inq > 0 && *ret > 0) { 864 + req->cqe.flags = cflags & ~CQE_F_MASK; 865 + sr->len = kmsg->msg.msg_inq; 866 + sr->done_io += *ret; 867 + sr->retry = true; 868 + return false; 869 + } 838 870 } else { 839 871 cflags |= io_put_kbuf(req, *ret, issue_flags); 840 872 } ··· 1223 1235 struct io_kiocb *notif; 1224 1236 1225 1237 zc->done_io = 0; 1238 + zc->retry = false; 1226 1239 req->flags |= REQ_F_POLL_NO_LAZY; 1227 1240 1228 1241 if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) ··· 1262 1273 1263 1274 zc->len = READ_ONCE(sqe->len); 1264 1275 zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; 1265 - zc->buf_index = READ_ONCE(sqe->buf_index); 1276 + req->buf_index = READ_ONCE(sqe->buf_index); 1266 1277 if (zc->msg_flags & MSG_DONTWAIT) 1267 1278 req->flags |= REQ_F_NOWAIT; 1268 1279 1269 - #ifdef CONFIG_COMPAT 1270 - if (req->ctx->compat) 1280 + if (io_is_compat(req->ctx)) 1271 1281 zc->msg_flags |= MSG_CMSG_COMPAT; 1272 - #endif 1282 + 1273 1283 if (unlikely(!io_msg_alloc_async(req))) 1274 1284 return -ENOMEM; 1275 1285 if (req->opcode != IORING_OP_SENDMSG_ZC) ··· 1333 1345 int ret; 1334 1346 1335 1347 if (sr->flags & IORING_RECVSEND_FIXED_BUF) { 1336 - struct io_ring_ctx *ctx = req->ctx; 1337 - 
struct io_rsrc_node *node; 1338 - 1339 - ret = -EFAULT; 1340 - io_ring_submit_lock(ctx, issue_flags); 1341 - node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index); 1342 - if (node) { 1343 - io_req_assign_buf_node(sr->notif, node); 1344 - ret = 0; 1345 - } 1346 - io_ring_submit_unlock(ctx, issue_flags); 1347 - 1348 - if (unlikely(ret)) 1349 - return ret; 1350 - 1351 - ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, 1352 - node->buf, (u64)(uintptr_t)sr->buf, 1353 - sr->len); 1348 + sr->notif->buf_index = req->buf_index; 1349 + ret = io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, 1350 + (u64)(uintptr_t)sr->buf, sr->len, 1351 + ITER_SOURCE, issue_flags); 1354 1352 if (unlikely(ret)) 1355 1353 return ret; 1356 1354 kmsg->msg.sg_from_iter = io_sg_from_iter; ··· 1573 1599 } 1574 1600 if (ret == -ERESTARTSYS) 1575 1601 ret = -EINTR; 1576 - req_set_fail(req); 1577 1602 } else if (!fixed) { 1578 1603 fd_install(fd, file); 1579 1604 ret = fd; ··· 1585 1612 if (!arg.is_empty) 1586 1613 cflags |= IORING_CQE_F_SOCK_NONEMPTY; 1587 1614 1588 - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { 1589 - io_req_set_res(req, ret, cflags); 1590 - return IOU_OK; 1591 - } 1592 - 1593 - if (ret < 0) 1594 - return ret; 1595 - if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { 1615 + if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) && 1616 + io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { 1596 1617 if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) 1597 1618 goto retry; 1598 1619 if (issue_flags & IO_URING_F_MULTISHOT) ··· 1595 1628 } 1596 1629 1597 1630 io_req_set_res(req, ret, cflags); 1631 + if (ret < 0) 1632 + req_set_fail(req); 1633 + if (!(issue_flags & IO_URING_F_MULTISHOT)) 1634 + return IOU_OK; 1598 1635 return IOU_STOP_MULTISHOT; 1599 1636 } 1600 1637
+3 -15
io_uring/nop.c
··· 16 16 struct file *file; 17 17 int result; 18 18 int fd; 19 - int buffer; 20 19 unsigned int flags; 21 20 }; 22 21 ··· 39 40 else 40 41 nop->fd = -1; 41 42 if (nop->flags & IORING_NOP_FIXED_BUFFER) 42 - nop->buffer = READ_ONCE(sqe->buf_index); 43 - else 44 - nop->buffer = -1; 43 + req->buf_index = READ_ONCE(sqe->buf_index); 45 44 return 0; 46 45 } 47 46 ··· 61 64 } 62 65 } 63 66 if (nop->flags & IORING_NOP_FIXED_BUFFER) { 64 - struct io_ring_ctx *ctx = req->ctx; 65 - struct io_rsrc_node *node; 66 - 67 - ret = -EFAULT; 68 - io_ring_submit_lock(ctx, issue_flags); 69 - node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer); 70 - if (node) { 71 - io_req_assign_buf_node(req, node); 72 - ret = 0; 73 - } 74 - io_ring_submit_unlock(ctx, issue_flags); 67 + if (!io_find_buf_node(req, issue_flags)) 68 + ret = -EFAULT; 75 69 } 76 70 done: 77 71 if (ret < 0)
+2 -2
io_uring/notif.c
··· 11 11 12 12 static const struct ubuf_info_ops io_ubuf_ops; 13 13 14 - static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) 14 + static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) 15 15 { 16 16 struct io_notif_data *nd = io_notif_to_data(notif); 17 17 ··· 29 29 } 30 30 31 31 nd = nd->next; 32 - io_req_task_complete(notif, ts); 32 + io_req_task_complete(notif, tw); 33 33 } while (nd); 34 34 } 35 35
+2 -2
io_uring/opdef.c
··· 104 104 .iopoll_queue = 1, 105 105 .async_size = sizeof(struct io_async_rw), 106 106 .prep = io_prep_read_fixed, 107 - .issue = io_read, 107 + .issue = io_read_fixed, 108 108 }, 109 109 [IORING_OP_WRITE_FIXED] = { 110 110 .needs_file = 1, ··· 118 118 .iopoll_queue = 1, 119 119 .async_size = sizeof(struct io_async_rw), 120 120 .prep = io_prep_write_fixed, 121 - .issue = io_write, 121 + .issue = io_write_fixed, 122 122 }, 123 123 [IORING_OP_POLL_ADD] = { 124 124 .needs_file = 1,
+6 -6
io_uring/opdef.h
··· 7 7 unsigned needs_file : 1; 8 8 /* should block plug */ 9 9 unsigned plug : 1; 10 + /* supports ioprio */ 11 + unsigned ioprio : 1; 12 + /* supports iopoll */ 13 + unsigned iopoll : 1; 14 + /* op supports buffer selection */ 15 + unsigned buffer_select : 1; 10 16 /* hash wq insertion if file is a regular file */ 11 17 unsigned hash_reg_file : 1; 12 18 /* unbound wq insertion if file is a non-regular file */ ··· 21 15 unsigned pollin : 1; 22 16 unsigned pollout : 1; 23 17 unsigned poll_exclusive : 1; 24 - /* op supports buffer selection */ 25 - unsigned buffer_select : 1; 26 18 /* skip auditing */ 27 19 unsigned audit_skip : 1; 28 - /* supports ioprio */ 29 - unsigned ioprio : 1; 30 - /* supports iopoll */ 31 - unsigned iopoll : 1; 32 20 /* have to be put into the iopoll list */ 33 21 unsigned iopoll_queue : 1; 34 22 /* vectored opcode, set if 1) vectored, and 2) handler needs to know */
+9 -9
io_uring/poll.c
··· 220 220 * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot 221 221 * poll and that the result is stored in req->cqe. 222 222 */ 223 - static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) 223 + static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) 224 224 { 225 225 int v; 226 226 ··· 288 288 return IOU_POLL_REMOVE_POLL_USE_RES; 289 289 } 290 290 } else { 291 - int ret = io_poll_issue(req, ts); 291 + int ret = io_poll_issue(req, tw); 292 292 if (ret == IOU_STOP_MULTISHOT) 293 293 return IOU_POLL_REMOVE_POLL_USE_RES; 294 294 else if (ret == IOU_REQUEUE) ··· 311 311 return IOU_POLL_NO_ACTION; 312 312 } 313 313 314 - void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) 314 + void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) 315 315 { 316 316 int ret; 317 317 318 - ret = io_poll_check_events(req, ts); 318 + ret = io_poll_check_events(req, tw); 319 319 if (ret == IOU_POLL_NO_ACTION) { 320 320 io_kbuf_recycle(req, 0); 321 321 return; ··· 335 335 poll = io_kiocb_to_cmd(req, struct io_poll); 336 336 req->cqe.res = mangle_poll(req->cqe.res & poll->events); 337 337 } else if (ret == IOU_POLL_REISSUE) { 338 - io_req_task_submit(req, ts); 338 + io_req_task_submit(req, tw); 339 339 return; 340 340 } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { 341 341 req->cqe.res = ret; ··· 343 343 } 344 344 345 345 io_req_set_res(req, req->cqe.res, 0); 346 - io_req_task_complete(req, ts); 346 + io_req_task_complete(req, tw); 347 347 } else { 348 - io_tw_lock(req->ctx, ts); 348 + io_tw_lock(req->ctx, tw); 349 349 350 350 if (ret == IOU_POLL_REMOVE_POLL_USE_RES) 351 - io_req_task_complete(req, ts); 351 + io_req_task_complete(req, tw); 352 352 else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) 353 - io_req_task_submit(req, ts); 353 + io_req_task_submit(req, tw); 354 354 else 355 355 io_req_defer_failed(req, ret); 356 356 }
+3 -1
io_uring/poll.h
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 + #include <linux/io_uring_types.h> 4 + 3 5 #define IO_POLL_ALLOC_CACHE_MAX 32 4 6 5 7 enum { ··· 45 43 bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 46 44 bool cancel_all); 47 45 48 - void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); 46 + void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw);
+213 -33
io_uring/rsrc.c
··· 9 9 #include <linux/hugetlb.h> 10 10 #include <linux/compat.h> 11 11 #include <linux/io_uring.h> 12 + #include <linux/io_uring/cmd.h> 12 13 13 14 #include <uapi/linux/io_uring.h> 14 15 ··· 32 31 /* only define max */ 33 32 #define IORING_MAX_FIXED_FILES (1U << 20) 34 33 #define IORING_MAX_REG_BUFFERS (1U << 14) 34 + 35 + #define IO_CACHED_BVECS_SEGS 32 35 36 36 37 int __io_account_mem(struct user_struct *user, unsigned long nr_pages) 37 38 { ··· 104 101 return 0; 105 102 } 106 103 107 - static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) 104 + static void io_release_ubuf(void *priv) 108 105 { 106 + struct io_mapped_ubuf *imu = priv; 109 107 unsigned int i; 110 108 111 - if (node->buf) { 112 - struct io_mapped_ubuf *imu = node->buf; 113 - 114 - if (!refcount_dec_and_test(&imu->refs)) 115 - return; 116 - for (i = 0; i < imu->nr_bvecs; i++) 117 - unpin_user_page(imu->bvec[i].bv_page); 118 - if (imu->acct_pages) 119 - io_unaccount_mem(ctx, imu->acct_pages); 120 - kvfree(imu); 121 - } 109 + for (i = 0; i < imu->nr_bvecs; i++) 110 + unpin_user_page(imu->bvec[i].bv_page); 122 111 } 123 112 124 - struct io_rsrc_node *io_rsrc_node_alloc(int type) 113 + static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, 114 + int nr_bvecs) 115 + { 116 + if (nr_bvecs <= IO_CACHED_BVECS_SEGS) 117 + return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); 118 + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs), 119 + GFP_KERNEL); 120 + } 121 + 122 + static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) 123 + { 124 + if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) 125 + io_cache_free(&ctx->imu_cache, imu); 126 + else 127 + kvfree(imu); 128 + } 129 + 130 + static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) 131 + { 132 + if (!refcount_dec_and_test(&imu->refs)) 133 + return; 134 + 135 + if (imu->acct_pages) 136 + io_unaccount_mem(ctx, imu->acct_pages); 137 + imu->release(imu->priv); 138 + 
io_free_imu(ctx, imu); 139 + } 140 + 141 + struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) 125 142 { 126 143 struct io_rsrc_node *node; 127 144 128 - node = kzalloc(sizeof(*node), GFP_KERNEL); 145 + node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); 129 146 if (node) { 130 147 node->type = type; 131 148 node->refs = 1; 149 + node->tag = 0; 150 + node->file_ptr = 0; 132 151 } 133 152 return node; 134 153 } 135 154 136 - __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) 155 + bool io_rsrc_cache_init(struct io_ring_ctx *ctx) 156 + { 157 + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, 158 + IO_CACHED_BVECS_SEGS); 159 + const int node_size = sizeof(struct io_rsrc_node); 160 + bool ret; 161 + 162 + ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, 163 + node_size, 0); 164 + ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, 165 + imu_cache_size, 0); 166 + return ret; 167 + } 168 + 169 + void io_rsrc_cache_free(struct io_ring_ctx *ctx) 170 + { 171 + io_alloc_cache_free(&ctx->node_cache, kfree); 172 + io_alloc_cache_free(&ctx->imu_cache, kfree); 173 + } 174 + 175 + __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, 176 + struct io_rsrc_data *data) 137 177 { 138 178 if (!data->nr) 139 179 return; ··· 249 203 err = -EBADF; 250 204 break; 251 205 } 252 - node = io_rsrc_node_alloc(IORING_RSRC_FILE); 206 + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 253 207 if (!node) { 254 208 err = -ENOMEM; 255 209 fput(file); ··· 495 449 496 450 switch (node->type) { 497 451 case IORING_RSRC_FILE: 498 - if (io_slot_file(node)) 499 - fput(io_slot_file(node)); 452 + fput(io_slot_file(node)); 500 453 break; 501 454 case IORING_RSRC_BUFFER: 502 - if (node->buf) 503 - io_buffer_unmap(ctx, node); 455 + io_buffer_unmap(ctx, node->buf); 504 456 break; 505 457 default: 506 458 WARN_ON_ONCE(1); 507 459 break; 508 460 } 509 461 510 - kfree(node); 462 + 
io_cache_free(&ctx->node_cache, node); 511 463 } 512 464 513 465 int io_sqe_files_unregister(struct io_ring_ctx *ctx) ··· 567 523 goto fail; 568 524 } 569 525 ret = -ENOMEM; 570 - node = io_rsrc_node_alloc(IORING_RSRC_FILE); 526 + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); 571 527 if (!node) { 572 528 fput(file); 573 529 goto fail; ··· 772 728 if (!iov->iov_base) 773 729 return NULL; 774 730 775 - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); 731 + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 776 732 if (!node) 777 733 return ERR_PTR(-ENOMEM); 778 - node->buf = NULL; 779 734 780 735 ret = -ENOMEM; 781 736 pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, ··· 791 748 coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); 792 749 } 793 750 794 - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 751 + imu = io_alloc_imu(ctx, nr_pages); 795 752 if (!imu) 796 753 goto done; 797 754 755 + imu->nr_bvecs = nr_pages; 798 756 ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); 799 757 if (ret) { 800 758 unpin_user_pages(pages, nr_pages); ··· 806 762 /* store original address for later verification */ 807 763 imu->ubuf = (unsigned long) iov->iov_base; 808 764 imu->len = iov->iov_len; 809 - imu->nr_bvecs = nr_pages; 810 765 imu->folio_shift = PAGE_SHIFT; 766 + imu->release = io_release_ubuf; 767 + imu->priv = imu; 768 + imu->is_kbuf = false; 769 + imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; 811 770 if (coalesced) 812 771 imu->folio_shift = data.folio_shift; 813 772 refcount_set(&imu->refs, 1); ··· 828 781 } 829 782 done: 830 783 if (ret) { 831 - kvfree(imu); 832 - if (node) 833 - io_put_rsrc_node(ctx, node); 784 + if (imu) 785 + io_free_imu(ctx, imu); 786 + io_cache_free(&ctx->node_cache, node); 834 787 node = ERR_PTR(ret); 835 788 } 836 789 kvfree(pages); ··· 907 860 return ret; 908 861 } 909 862 910 - int io_import_fixed(int ddir, struct iov_iter *iter, 863 + int io_buffer_register_bvec(struct io_uring_cmd *cmd, 
struct request *rq, 864 + void (*release)(void *), unsigned int index, 865 + unsigned int issue_flags) 866 + { 867 + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; 868 + struct io_rsrc_data *data = &ctx->buf_table; 869 + struct req_iterator rq_iter; 870 + struct io_mapped_ubuf *imu; 871 + struct io_rsrc_node *node; 872 + struct bio_vec bv, *bvec; 873 + u16 nr_bvecs; 874 + int ret = 0; 875 + 876 + io_ring_submit_lock(ctx, issue_flags); 877 + if (index >= data->nr) { 878 + ret = -EINVAL; 879 + goto unlock; 880 + } 881 + index = array_index_nospec(index, data->nr); 882 + 883 + if (data->nodes[index]) { 884 + ret = -EBUSY; 885 + goto unlock; 886 + } 887 + 888 + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 889 + if (!node) { 890 + ret = -ENOMEM; 891 + goto unlock; 892 + } 893 + 894 + nr_bvecs = blk_rq_nr_phys_segments(rq); 895 + imu = io_alloc_imu(ctx, nr_bvecs); 896 + if (!imu) { 897 + kfree(node); 898 + ret = -ENOMEM; 899 + goto unlock; 900 + } 901 + 902 + imu->ubuf = 0; 903 + imu->len = blk_rq_bytes(rq); 904 + imu->acct_pages = 0; 905 + imu->folio_shift = PAGE_SHIFT; 906 + imu->nr_bvecs = nr_bvecs; 907 + refcount_set(&imu->refs, 1); 908 + imu->release = release; 909 + imu->priv = rq; 910 + imu->is_kbuf = true; 911 + imu->dir = 1 << rq_data_dir(rq); 912 + 913 + bvec = imu->bvec; 914 + rq_for_each_bvec(bv, rq, rq_iter) 915 + *bvec++ = bv; 916 + 917 + node->buf = imu; 918 + data->nodes[index] = node; 919 + unlock: 920 + io_ring_submit_unlock(ctx, issue_flags); 921 + return ret; 922 + } 923 + EXPORT_SYMBOL_GPL(io_buffer_register_bvec); 924 + 925 + int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, 926 + unsigned int issue_flags) 927 + { 928 + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; 929 + struct io_rsrc_data *data = &ctx->buf_table; 930 + struct io_rsrc_node *node; 931 + int ret = 0; 932 + 933 + io_ring_submit_lock(ctx, issue_flags); 934 + if (index >= data->nr) { 935 + ret = -EINVAL; 936 + goto unlock; 937 + } 938 + 
index = array_index_nospec(index, data->nr); 939 + 940 + node = data->nodes[index]; 941 + if (!node) { 942 + ret = -EINVAL; 943 + goto unlock; 944 + } 945 + if (!node->buf->is_kbuf) { 946 + ret = -EBUSY; 947 + goto unlock; 948 + } 949 + 950 + io_put_rsrc_node(ctx, node); 951 + data->nodes[index] = NULL; 952 + unlock: 953 + io_ring_submit_unlock(ctx, issue_flags); 954 + return ret; 955 + } 956 + EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); 957 + 958 + static int io_import_fixed(int ddir, struct iov_iter *iter, 911 959 struct io_mapped_ubuf *imu, 912 960 u64 buf_addr, size_t len) 913 961 { ··· 1016 874 /* not inside the mapped region */ 1017 875 if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) 1018 876 return -EFAULT; 877 + if (!(imu->dir & (1 << ddir))) 878 + return -EFAULT; 1019 879 1020 880 /* 1021 881 * Might not be a start of buffer, set size appropriately ··· 1030 886 /* 1031 887 * Don't use iov_iter_advance() here, as it's really slow for 1032 888 * using the latter parts of a big fixed buffer - it iterates 1033 - * over each segment manually. We can cheat a bit here, because 1034 - * we know that: 889 + * over each segment manually. We can cheat a bit here for user 890 + * registered nodes, because we know that: 1035 891 * 1036 892 * 1) it's a BVEC iter, we set it up 1037 893 * 2) all bvecs are the same in size, except potentially the ··· 1045 901 */ 1046 902 const struct bio_vec *bvec = imu->bvec; 1047 903 904 + /* 905 + * Kernel buffer bvecs, on the other hand, don't necessarily 906 + * have the size property of user registered ones, so we have 907 + * to use the slow iter advance. 
908 + */ 1048 909 if (offset < bvec->bv_len) { 1049 910 iter->iov_offset = offset; 911 + } else if (imu->is_kbuf) { 912 + iov_iter_advance(iter, offset); 1050 913 } else { 1051 914 unsigned long seg_skip; 1052 915 ··· 1068 917 } 1069 918 1070 919 return 0; 920 + } 921 + 922 + inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, 923 + unsigned issue_flags) 924 + { 925 + struct io_ring_ctx *ctx = req->ctx; 926 + struct io_rsrc_node *node; 927 + 928 + if (req->flags & REQ_F_BUF_NODE) 929 + return req->buf_node; 930 + 931 + io_ring_submit_lock(ctx, issue_flags); 932 + node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); 933 + if (node) 934 + io_req_assign_buf_node(req, node); 935 + io_ring_submit_unlock(ctx, issue_flags); 936 + return node; 937 + } 938 + 939 + int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, 940 + u64 buf_addr, size_t len, int ddir, 941 + unsigned issue_flags) 942 + { 943 + struct io_rsrc_node *node; 944 + 945 + node = io_find_buf_node(req, issue_flags); 946 + if (!node) 947 + return -EFAULT; 948 + return io_import_fixed(ddir, iter, node->buf, buf_addr, len); 1071 949 } 1072 950 1073 951 /* Lock two rings at once. The rings must be different! */ ··· 1182 1002 if (!src_node) { 1183 1003 dst_node = NULL; 1184 1004 } else { 1185 - dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); 1005 + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); 1186 1006 if (!dst_node) { 1187 1007 ret = -ENOMEM; 1188 1008 goto out_free;
+19 -5
io_uring/rsrc.h
··· 2 2 #ifndef IOU_RSRC_H 3 3 #define IOU_RSRC_H 4 4 5 + #include <linux/io_uring_types.h> 5 6 #include <linux/lockdep.h> 6 7 7 8 enum { ··· 21 20 }; 22 21 }; 23 22 23 + enum { 24 + IO_IMU_DEST = 1 << ITER_DEST, 25 + IO_IMU_SOURCE = 1 << ITER_SOURCE, 26 + }; 27 + 24 28 struct io_mapped_ubuf { 25 29 u64 ubuf; 26 30 unsigned int len; ··· 33 27 unsigned int folio_shift; 34 28 refcount_t refs; 35 29 unsigned long acct_pages; 30 + void (*release)(void *); 31 + void *priv; 32 + bool is_kbuf; 33 + u8 dir; 36 34 struct bio_vec bvec[] __counted_by(nr_bvecs); 37 35 }; 38 36 ··· 49 39 unsigned int nr_folios; 50 40 }; 51 41 52 - struct io_rsrc_node *io_rsrc_node_alloc(int type); 42 + bool io_rsrc_cache_init(struct io_ring_ctx *ctx); 43 + void io_rsrc_cache_free(struct io_ring_ctx *ctx); 44 + struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); 53 45 void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); 54 46 void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); 55 47 int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); 56 48 57 - int io_import_fixed(int ddir, struct iov_iter *iter, 58 - struct io_mapped_ubuf *imu, 59 - u64 buf_addr, size_t len); 49 + struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, 50 + unsigned issue_flags); 51 + int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, 52 + u64 buf_addr, size_t len, int ddir, 53 + unsigned issue_flags); 60 54 61 55 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); 62 56 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); ··· 91 77 static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) 92 78 { 93 79 lockdep_assert_held(&ctx->uring_lock); 94 - if (node && !--node->refs) 80 + if (!--node->refs) 95 81 io_free_rsrc_node(ctx, node); 96 82 } 97 83
+108 -92
io_uring/rw.c
··· 49 49 return false; 50 50 } 51 51 52 - #ifdef CONFIG_COMPAT 53 52 static int io_iov_compat_buffer_select_prep(struct io_rw *rw) 54 53 { 55 - struct compat_iovec __user *uiov; 56 - compat_ssize_t clen; 54 + struct compat_iovec __user *uiov = u64_to_user_ptr(rw->addr); 55 + struct compat_iovec iov; 57 56 58 - uiov = u64_to_user_ptr(rw->addr); 59 - if (!access_ok(uiov, sizeof(*uiov))) 57 + if (copy_from_user(&iov, uiov, sizeof(iov))) 60 58 return -EFAULT; 61 - if (__get_user(clen, &uiov->iov_len)) 62 - return -EFAULT; 63 - if (clen < 0) 64 - return -EINVAL; 65 - 66 - rw->len = clen; 59 + rw->len = iov.iov_len; 67 60 return 0; 68 61 } 69 - #endif 70 62 71 63 static int io_iov_buffer_select_prep(struct io_kiocb *req) 72 64 { ··· 69 77 if (rw->len != 1) 70 78 return -EINVAL; 71 79 72 - #ifdef CONFIG_COMPAT 73 - if (req->ctx->compat) 80 + if (io_is_compat(req->ctx)) 74 81 return io_iov_compat_buffer_select_prep(rw); 75 - #endif 76 82 77 83 uiov = u64_to_user_ptr(rw->addr); 78 84 if (copy_from_user(&iov, uiov, sizeof(*uiov))) ··· 79 89 return 0; 80 90 } 81 91 82 - static int __io_import_iovec(int ddir, struct io_kiocb *req, 83 - struct io_async_rw *io, 84 - unsigned int issue_flags) 92 + static int io_import_vec(int ddir, struct io_kiocb *req, 93 + struct io_async_rw *io, 94 + const struct iovec __user *uvec, 95 + size_t uvec_segs) 85 96 { 86 - const struct io_issue_def *def = &io_issue_defs[req->opcode]; 87 - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 97 + int ret, nr_segs; 88 98 struct iovec *iov; 89 - void __user *buf; 90 - int nr_segs, ret; 91 - size_t sqe_len; 92 - 93 - buf = u64_to_user_ptr(rw->addr); 94 - sqe_len = rw->len; 95 - 96 - if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { 97 - if (io_do_buffer_select(req)) { 98 - buf = io_buffer_select(req, &sqe_len, issue_flags); 99 - if (!buf) 100 - return -ENOBUFS; 101 - rw->addr = (unsigned long) buf; 102 - rw->len = sqe_len; 103 - } 104 - 105 - return import_ubuf(ddir, buf, sqe_len, 
&io->iter); 106 - } 107 99 108 100 if (io->free_iovec) { 109 101 nr_segs = io->free_iov_nr; 110 102 iov = io->free_iovec; 111 103 } else { 112 - iov = &io->fast_iov; 113 104 nr_segs = 1; 105 + iov = &io->fast_iov; 114 106 } 115 - ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, 116 - req->ctx->compat); 107 + 108 + ret = __import_iovec(ddir, uvec, uvec_segs, nr_segs, &iov, &io->iter, 109 + io_is_compat(req->ctx)); 117 110 if (unlikely(ret < 0)) 118 111 return ret; 119 112 if (iov) { ··· 108 135 return 0; 109 136 } 110 137 111 - static inline int io_import_iovec(int rw, struct io_kiocb *req, 112 - struct io_async_rw *io, 113 - unsigned int issue_flags) 138 + static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, 139 + struct io_async_rw *io, 140 + unsigned int issue_flags) 141 + { 142 + const struct io_issue_def *def = &io_issue_defs[req->opcode]; 143 + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 144 + void __user *buf = u64_to_user_ptr(rw->addr); 145 + size_t sqe_len = rw->len; 146 + 147 + if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT)) 148 + return io_import_vec(ddir, req, io, buf, sqe_len); 149 + 150 + if (io_do_buffer_select(req)) { 151 + buf = io_buffer_select(req, &sqe_len, issue_flags); 152 + if (!buf) 153 + return -ENOBUFS; 154 + rw->addr = (unsigned long) buf; 155 + rw->len = sqe_len; 156 + } 157 + return import_ubuf(ddir, buf, sqe_len, &io->iter); 158 + } 159 + 160 + static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, 161 + struct io_async_rw *io, 162 + unsigned int issue_flags) 114 163 { 115 164 int ret; 116 165 117 - ret = __io_import_iovec(rw, req, io, issue_flags); 166 + ret = __io_import_rw_buffer(rw, req, io, issue_flags); 118 167 if (unlikely(ret < 0)) 119 168 return ret; 120 169 ··· 207 212 return 0; 208 213 } 209 214 210 - static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) 211 - { 212 - struct io_async_rw *rw; 213 - 214 - if (io_rw_alloc_async(req)) 215 - 
return -ENOMEM; 216 - 217 - if (!do_import || io_do_buffer_select(req)) 218 - return 0; 219 - 220 - rw = req->async_data; 221 - return io_import_iovec(ddir, req, rw, 0); 222 - } 223 - 224 215 static inline void io_meta_save_state(struct io_async_rw *io) 225 216 { 226 217 io->meta_state.seed = io->meta.seed; ··· 248 267 return ret; 249 268 } 250 269 251 - static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, 252 - int ddir, bool do_import) 270 + static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, 271 + int ddir) 253 272 { 254 273 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 255 274 unsigned ioprio; 256 275 u64 attr_type_mask; 257 276 int ret; 277 + 278 + if (io_rw_alloc_async(req)) 279 + return -ENOMEM; 258 280 259 281 rw->kiocb.ki_pos = READ_ONCE(sqe->off); 260 282 /* used for fixed read/write too - just read unconditionally */ ··· 284 300 rw->addr = READ_ONCE(sqe->addr); 285 301 rw->len = READ_ONCE(sqe->len); 286 302 rw->flags = READ_ONCE(sqe->rw_flags); 287 - ret = io_prep_rw_setup(req, ddir, do_import); 288 - 289 - if (unlikely(ret)) 290 - return ret; 291 303 292 304 attr_type_mask = READ_ONCE(sqe->attr_type_mask); 293 305 if (attr_type_mask) { ··· 294 314 return -EINVAL; 295 315 296 316 attr_ptr = READ_ONCE(sqe->attr_ptr); 297 - ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); 317 + return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); 298 318 } 299 - return ret; 319 + return 0; 320 + } 321 + 322 + static int io_rw_do_import(struct io_kiocb *req, int ddir) 323 + { 324 + if (io_do_buffer_select(req)) 325 + return 0; 326 + 327 + return io_import_rw_buffer(ddir, req, req->async_data, 0); 328 + } 329 + 330 + static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, 331 + int ddir) 332 + { 333 + int ret; 334 + 335 + ret = __io_prep_rw(req, sqe, ddir); 336 + if (unlikely(ret)) 337 + return ret; 338 + 339 + return io_rw_do_import(req, ddir); 300 340 } 301 341 302 342 int 
io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) 303 343 { 304 - return io_prep_rw(req, sqe, ITER_DEST, true); 344 + return io_prep_rw(req, sqe, ITER_DEST); 305 345 } 306 346 307 347 int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) 308 348 { 309 - return io_prep_rw(req, sqe, ITER_SOURCE, true); 349 + return io_prep_rw(req, sqe, ITER_SOURCE); 310 350 } 311 351 312 352 static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, 313 353 int ddir) 314 354 { 315 - const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); 316 355 int ret; 317 356 318 - ret = io_prep_rw(req, sqe, ddir, do_import); 357 + ret = io_prep_rw(req, sqe, ddir); 319 358 if (unlikely(ret)) 320 359 return ret; 321 - if (do_import) 360 + if (!(req->flags & REQ_F_BUFFER_SELECT)) 322 361 return 0; 323 362 324 363 /* ··· 357 358 return io_prep_rwv(req, sqe, ITER_SOURCE); 358 359 } 359 360 360 - static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, 361 + static int io_init_rw_fixed(struct io_kiocb *req, unsigned int issue_flags, 361 362 int ddir) 362 363 { 363 364 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 364 - struct io_ring_ctx *ctx = req->ctx; 365 - struct io_rsrc_node *node; 366 - struct io_async_rw *io; 365 + struct io_async_rw *io = req->async_data; 367 366 int ret; 368 367 369 - ret = io_prep_rw(req, sqe, ddir, false); 370 - if (unlikely(ret)) 371 - return ret; 368 + if (io->bytes_done) 369 + return 0; 372 370 373 - node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); 374 - if (!node) 375 - return -EFAULT; 376 - io_req_assign_buf_node(req, node); 377 - 378 - io = req->async_data; 379 - ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len); 371 + ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir, 372 + issue_flags); 380 373 iov_iter_save_state(&io->iter, &io->iter_state); 381 374 return ret; 382 375 } 383 376 384 377 int io_prep_read_fixed(struct io_kiocb *req, 
const struct io_uring_sqe *sqe) 385 378 { 386 - return io_prep_rw_fixed(req, sqe, ITER_DEST); 379 + return __io_prep_rw(req, sqe, ITER_DEST); 387 380 } 388 381 389 382 int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) 390 383 { 391 - return io_prep_rw_fixed(req, sqe, ITER_SOURCE); 384 + return __io_prep_rw(req, sqe, ITER_SOURCE); 392 385 } 393 386 394 387 /* ··· 396 405 if (!(req->flags & REQ_F_BUFFER_SELECT)) 397 406 return -EINVAL; 398 407 399 - ret = io_prep_rw(req, sqe, ITER_DEST, false); 408 + ret = __io_prep_rw(req, sqe, ITER_DEST); 400 409 if (unlikely(ret)) 401 410 return ret; 402 411 ··· 510 519 return res; 511 520 } 512 521 513 - void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) 522 + void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) 514 523 { 515 524 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 516 525 struct kiocb *kiocb = &rw->kiocb; ··· 527 536 req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); 528 537 529 538 io_req_rw_cleanup(req, 0); 530 - io_req_task_complete(req, ts); 539 + io_req_task_complete(req, tw); 531 540 } 532 541 533 542 static void io_complete_rw(struct kiocb *kiocb, long res) ··· 629 638 */ 630 639 static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) 631 640 { 641 + struct io_kiocb *req = cmd_to_io_kiocb(rw); 632 642 struct kiocb *kiocb = &rw->kiocb; 633 643 struct file *file = kiocb->ki_filp; 634 644 ssize_t ret = 0; ··· 645 653 if ((kiocb->ki_flags & IOCB_NOWAIT) && 646 654 !(kiocb->ki_filp->f_flags & O_NONBLOCK)) 647 655 return -EAGAIN; 656 + if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf) 657 + return -EFAULT; 648 658 649 659 ppos = io_kiocb_ppos(kiocb); 650 660 ··· 858 864 loff_t *ppos; 859 865 860 866 if (io_do_buffer_select(req)) { 861 - ret = io_import_iovec(ITER_DEST, req, io, issue_flags); 867 + ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); 862 868 if (unlikely(ret < 0)) 863 869 return ret; 864 
870 } ··· 1147 1153 io_req_end_write(req); 1148 1154 return -EAGAIN; 1149 1155 } 1156 + } 1157 + 1158 + int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags) 1159 + { 1160 + int ret; 1161 + 1162 + ret = io_init_rw_fixed(req, issue_flags, ITER_DEST); 1163 + if (unlikely(ret)) 1164 + return ret; 1165 + 1166 + return io_read(req, issue_flags); 1167 + } 1168 + 1169 + int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags) 1170 + { 1171 + int ret; 1172 + 1173 + ret = io_init_rw_fixed(req, issue_flags, ITER_SOURCE); 1174 + if (unlikely(ret)) 1175 + return ret; 1176 + 1177 + return io_write(req, issue_flags); 1150 1178 } 1151 1179 1152 1180 void io_rw_fail(struct io_kiocb *req)
+4 -1
io_uring/rw.h
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 + #include <linux/io_uring_types.h> 3 4 #include <linux/pagemap.h> 4 5 5 6 struct io_meta_state { ··· 38 37 int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); 39 38 int io_read(struct io_kiocb *req, unsigned int issue_flags); 40 39 int io_write(struct io_kiocb *req, unsigned int issue_flags); 40 + int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags); 41 + int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags); 41 42 void io_readv_writev_cleanup(struct io_kiocb *req); 42 43 void io_rw_fail(struct io_kiocb *req); 43 - void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); 44 + void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw); 44 45 int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 45 46 int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); 46 47 void io_rw_cache_free(const void *entry);
+2 -1
io_uring/splice.c
··· 51 51 { 52 52 struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice); 53 53 54 - io_put_rsrc_node(req->ctx, sp->rsrc_node); 54 + if (sp->rsrc_node) 55 + io_put_rsrc_node(req->ctx, sp->rsrc_node); 55 56 } 56 57 57 58 static struct file *io_splice_get_file(struct io_kiocb *req,
+8 -8
io_uring/timeout.c
··· 65 65 66 66 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); 67 67 68 - static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) 68 + static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) 69 69 { 70 70 struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 71 71 struct io_timeout_data *data = req->async_data; ··· 82 82 } 83 83 } 84 84 85 - io_req_task_complete(req, ts); 85 + io_req_task_complete(req, tw); 86 86 } 87 87 88 88 static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) ··· 154 154 io_flush_killed_timeouts(&list, 0); 155 155 } 156 156 157 - static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) 157 + static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) 158 158 { 159 - io_tw_lock(link->ctx, ts); 159 + io_tw_lock(link->ctx, tw); 160 160 while (link) { 161 161 struct io_kiocb *nxt = link->link; 162 162 long res = -ECANCELED; ··· 165 165 res = link->cqe.res; 166 166 link->link = NULL; 167 167 io_req_set_res(link, res, 0); 168 - io_req_task_complete(link, ts); 168 + io_req_task_complete(link, tw); 169 169 link = nxt; 170 170 } 171 171 } ··· 312 312 return 0; 313 313 } 314 314 315 - static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) 315 + static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) 316 316 { 317 317 struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 318 318 struct io_kiocb *prev = timeout->prev; ··· 330 330 ret = -ECANCELED; 331 331 } 332 332 io_req_set_res(req, ret ?: -ETIME, 0); 333 - io_req_task_complete(req, ts); 333 + io_req_task_complete(req, tw); 334 334 io_put_req(prev); 335 335 } else { 336 336 io_req_set_res(req, -ETIME, 0); 337 - io_req_task_complete(req, ts); 337 + io_req_task_complete(req, tw); 338 338 } 339 339 } 340 340
+8 -23
io_uring/uring_cmd.c
··· 102 102 } 103 103 EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); 104 104 105 - static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) 105 + static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) 106 106 { 107 107 struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 108 108 unsigned int flags = IO_URING_F_COMPLETE_DEFER; ··· 199 199 if (ioucmd->flags & ~IORING_URING_CMD_MASK) 200 200 return -EINVAL; 201 201 202 - if (ioucmd->flags & IORING_URING_CMD_FIXED) { 203 - struct io_ring_ctx *ctx = req->ctx; 204 - struct io_rsrc_node *node; 205 - u16 index = READ_ONCE(sqe->buf_index); 202 + if (ioucmd->flags & IORING_URING_CMD_FIXED) 203 + req->buf_index = READ_ONCE(sqe->buf_index); 206 204 207 - node = io_rsrc_node_lookup(&ctx->buf_table, index); 208 - if (unlikely(!node)) 209 - return -EFAULT; 210 - /* 211 - * Pi node upfront, prior to io_uring_cmd_import_fixed() 212 - * being called. This prevents destruction of the mapped buffer 213 - * we'll need at actual import time. 
214 - */ 215 - io_req_assign_buf_node(req, node); 216 - } 217 205 ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); 218 206 219 207 return io_uring_cmd_prep_setup(req, sqe); ··· 225 237 issue_flags |= IO_URING_F_SQE128; 226 238 if (ctx->flags & IORING_SETUP_CQE32) 227 239 issue_flags |= IO_URING_F_CQE32; 228 - if (ctx->compat) 240 + if (io_is_compat(ctx)) 229 241 issue_flags |= IO_URING_F_COMPAT; 230 242 if (ctx->flags & IORING_SETUP_IOPOLL) { 231 243 if (!file->f_op->uring_cmd_iopoll) ··· 245 257 } 246 258 247 259 int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, 248 - struct iov_iter *iter, void *ioucmd) 260 + struct iov_iter *iter, 261 + struct io_uring_cmd *ioucmd, 262 + unsigned int issue_flags) 249 263 { 250 264 struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); 251 - struct io_rsrc_node *node = req->buf_node; 252 265 253 - /* Must have had rsrc_node assigned at prep time */ 254 - if (node) 255 - return io_import_fixed(rw, iter, node->buf, ubuf, len); 256 - 257 - return -EFAULT; 266 + return io_import_reg_buf(req, iter, ubuf, len, rw, issue_flags); 258 267 } 259 268 EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); 260 269
+8 -48
io_uring/waitid.c
··· 16 16 #include "waitid.h" 17 17 #include "../kernel/exit.h" 18 18 19 - static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); 19 + static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw); 20 20 21 21 #define IO_WAITID_CANCEL_FLAG BIT(31) 22 22 #define IO_WAITID_REF_MASK GENMASK(30, 0) ··· 42 42 req->flags &= ~REQ_F_ASYNC_DATA; 43 43 } 44 44 45 - #ifdef CONFIG_COMPAT 46 45 static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) 47 46 { 48 47 struct compat_siginfo __user *infop; ··· 66 67 ret = false; 67 68 goto done; 68 69 } 69 - #endif 70 70 71 71 static bool io_waitid_copy_si(struct io_kiocb *req, int signo) 72 72 { ··· 75 77 if (!iw->infop) 76 78 return true; 77 79 78 - #ifdef CONFIG_COMPAT 79 - if (req->ctx->compat) 80 + if (io_is_compat(req->ctx)) 80 81 return io_waitid_compat_copy_si(iw, signo); 81 - #endif 82 82 83 83 if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) 84 84 return false; ··· 128 132 io_req_set_res(req, ret, 0); 129 133 } 130 134 131 - static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) 135 + static bool __io_waitid_cancel(struct io_kiocb *req) 132 136 { 133 137 struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 134 138 struct io_waitid_async *iwa = req->async_data; ··· 154 158 int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 155 159 unsigned int issue_flags) 156 160 { 157 - struct hlist_node *tmp; 158 - struct io_kiocb *req; 159 - int nr = 0; 160 - 161 - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) 162 - return -ENOENT; 163 - 164 - io_ring_submit_lock(ctx, issue_flags); 165 - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { 166 - if (req->cqe.user_data != cd->data && 167 - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) 168 - continue; 169 - if (__io_waitid_cancel(ctx, req)) 170 - nr++; 171 - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) 172 - break; 173 - } 174 - io_ring_submit_unlock(ctx, 
issue_flags); 175 - 176 - if (nr) 177 - return nr; 178 - 179 - return -ENOENT; 161 + return io_cancel_remove(ctx, cd, issue_flags, &ctx->waitid_list, __io_waitid_cancel); 180 162 } 181 163 182 164 bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 183 165 bool cancel_all) 184 166 { 185 - struct hlist_node *tmp; 186 - struct io_kiocb *req; 187 - bool found = false; 188 - 189 - lockdep_assert_held(&ctx->uring_lock); 190 - 191 - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { 192 - if (!io_match_task_safe(req, tctx, cancel_all)) 193 - continue; 194 - hlist_del_init(&req->hash_node); 195 - __io_waitid_cancel(ctx, req); 196 - found = true; 197 - } 198 - 199 - return found; 167 + return io_cancel_remove_all(ctx, tctx, &ctx->waitid_list, cancel_all, __io_waitid_cancel); 200 168 } 201 169 202 170 static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) ··· 181 221 return true; 182 222 } 183 223 184 - static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) 224 + static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) 185 225 { 186 226 struct io_waitid_async *iwa = req->async_data; 187 227 struct io_ring_ctx *ctx = req->ctx; 188 228 int ret; 189 229 190 - io_tw_lock(ctx, ts); 230 + io_tw_lock(ctx, tw); 191 231 192 232 ret = __do_wait(&iwa->wo); 193 233 ··· 217 257 } 218 258 219 259 io_waitid_complete(req, ret); 220 - io_req_task_complete(req, ts); 260 + io_req_task_complete(req, tw); 221 261 } 222 262 223 263 static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
+1
tools/testing/selftests/Makefile
··· 113 113 TARGETS += tmpfs 114 114 TARGETS += tpm2 115 115 TARGETS += tty 116 + TARGETS += ublk 116 117 TARGETS += uevent 117 118 TARGETS += user_events 118 119 TARGETS += vDSO
+3
tools/testing/selftests/ublk/.gitignore
··· 1 + kublk 2 + /tools 3 + *-verify.state
+16
tools/testing/selftests/ublk/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0 2 + 3 + CFLAGS += -O3 -Wl,-no-as-needed -Wall -I $(top_srcdir) 4 + LDLIBS += -lpthread -lm -luring 5 + 6 + TEST_PROGS := test_null_01.sh 7 + TEST_PROGS += test_loop_01.sh 8 + TEST_PROGS += test_loop_02.sh 9 + TEST_PROGS += test_loop_03.sh 10 + TEST_PROGS += test_loop_04.sh 11 + 12 + TEST_GEN_PROGS_EXTENDED = kublk 13 + 14 + include ../lib.mk 15 + 16 + $(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c
+1
tools/testing/selftests/ublk/config
··· 1 + CONFIG_BLK_DEV_UBLK=m
+220
tools/testing/selftests/ublk/file_backed.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "kublk.h" 4 + 5 + static void backing_file_tgt_deinit(struct ublk_dev *dev) 6 + { 7 + int i; 8 + 9 + for (i = 1; i < dev->nr_fds; i++) { 10 + fsync(dev->fds[i]); 11 + close(dev->fds[i]); 12 + } 13 + } 14 + 15 + static int backing_file_tgt_init(struct ublk_dev *dev) 16 + { 17 + int fd, i; 18 + 19 + assert(dev->nr_fds == 1); 20 + 21 + for (i = 0; i < dev->tgt.nr_backing_files; i++) { 22 + char *file = dev->tgt.backing_file[i]; 23 + unsigned long bytes; 24 + struct stat st; 25 + 26 + ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); 27 + 28 + fd = open(file, O_RDWR | O_DIRECT); 29 + if (fd < 0) { 30 + ublk_err("%s: backing file %s can't be opened: %s\n", 31 + __func__, file, strerror(errno)); 32 + return -EBADF; 33 + } 34 + 35 + if (fstat(fd, &st) < 0) { 36 + close(fd); 37 + return -EBADF; 38 + } 39 + 40 + if (S_ISREG(st.st_mode)) 41 + bytes = st.st_size; 42 + else if (S_ISBLK(st.st_mode)) { 43 + if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) 44 + return -1; 45 + } else { 46 + return -EINVAL; 47 + } 48 + 49 + dev->tgt.backing_file_size[i] = bytes; 50 + dev->fds[dev->nr_fds] = fd; 51 + dev->nr_fds += 1; 52 + } 53 + 54 + return 0; 55 + } 56 + 57 + static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int zc) 58 + { 59 + unsigned ublk_op = ublksrv_get_op(iod); 60 + 61 + if (ublk_op == UBLK_IO_OP_READ) 62 + return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; 63 + else if (ublk_op == UBLK_IO_OP_WRITE) 64 + return zc ? 
IORING_OP_WRITE_FIXED : IORING_OP_WRITE; 65 + assert(0); 66 + } 67 + 68 + static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) 69 + { 70 + int zc = ublk_queue_use_zc(q); 71 + enum io_uring_op op = ublk_to_uring_op(iod, zc); 72 + struct io_uring_sqe *reg; 73 + struct io_uring_sqe *rw; 74 + struct io_uring_sqe *ureg; 75 + 76 + if (!zc) { 77 + rw = ublk_queue_alloc_sqe(q); 78 + if (!rw) 79 + return -ENOMEM; 80 + 81 + io_uring_prep_rw(op, rw, 1 /*fds[1]*/, 82 + (void *)iod->addr, 83 + iod->nr_sectors << 9, 84 + iod->start_sector << 9); 85 + io_uring_sqe_set_flags(rw, IOSQE_FIXED_FILE); 86 + q->io_inflight++; 87 + /* bit63 marks us as tgt io */ 88 + rw->user_data = build_user_data(tag, op, UBLK_IO_TGT_NORMAL, 1); 89 + return 0; 90 + } 91 + 92 + ublk_queue_alloc_sqe3(q, &reg, &rw, &ureg); 93 + 94 + io_uring_prep_buf_register(reg, 0, tag, q->q_id, tag); 95 + reg->user_data = build_user_data(tag, 0xfe, 1, 1); 96 + reg->flags |= IOSQE_CQE_SKIP_SUCCESS; 97 + reg->flags |= IOSQE_IO_LINK; 98 + 99 + io_uring_prep_rw(op, rw, 1 /*fds[1]*/, 0, 100 + iod->nr_sectors << 9, 101 + iod->start_sector << 9); 102 + rw->buf_index = tag; 103 + rw->flags |= IOSQE_FIXED_FILE; 104 + rw->flags |= IOSQE_IO_LINK; 105 + rw->user_data = build_user_data(tag, op, UBLK_IO_TGT_ZC_OP, 1); 106 + q->io_inflight++; 107 + 108 + io_uring_prep_buf_unregister(ureg, 0, tag, q->q_id, tag); 109 + ureg->user_data = build_user_data(tag, 0xff, UBLK_IO_TGT_ZC_BUF, 1); 110 + q->io_inflight++; 111 + 112 + return 0; 113 + } 114 + 115 + static int loop_queue_tgt_io(struct ublk_queue *q, int tag) 116 + { 117 + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); 118 + unsigned ublk_op = ublksrv_get_op(iod); 119 + struct io_uring_sqe *sqe; 120 + 121 + switch (ublk_op) { 122 + case UBLK_IO_OP_FLUSH: 123 + sqe = ublk_queue_alloc_sqe(q); 124 + if (!sqe) 125 + return -ENOMEM; 126 + io_uring_prep_sync_file_range(sqe, 1 /*fds[1]*/, 127 + iod->nr_sectors << 9, 128 + 
iod->start_sector << 9, 129 + IORING_FSYNC_DATASYNC); 130 + io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE); 131 + q->io_inflight++; 132 + sqe->user_data = build_user_data(tag, ublk_op, UBLK_IO_TGT_NORMAL, 1); 133 + break; 134 + case UBLK_IO_OP_WRITE_ZEROES: 135 + case UBLK_IO_OP_DISCARD: 136 + return -ENOTSUP; 137 + case UBLK_IO_OP_READ: 138 + case UBLK_IO_OP_WRITE: 139 + loop_queue_tgt_rw_io(q, iod, tag); 140 + break; 141 + default: 142 + return -EINVAL; 143 + } 144 + 145 + ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u\n", __func__, tag, 146 + iod->op_flags, iod->start_sector, iod->nr_sectors << 9); 147 + return 1; 148 + } 149 + 150 + static int ublk_loop_queue_io(struct ublk_queue *q, int tag) 151 + { 152 + int queued = loop_queue_tgt_io(q, tag); 153 + 154 + if (queued < 0) 155 + ublk_complete_io(q, tag, queued); 156 + 157 + return 0; 158 + } 159 + 160 + static void ublk_loop_io_done(struct ublk_queue *q, int tag, 161 + const struct io_uring_cqe *cqe) 162 + { 163 + int cqe_tag = user_data_to_tag(cqe->user_data); 164 + unsigned tgt_data = user_data_to_tgt_data(cqe->user_data); 165 + int res = cqe->res; 166 + 167 + if (res < 0 || tgt_data == UBLK_IO_TGT_NORMAL) 168 + goto complete; 169 + 170 + if (tgt_data == UBLK_IO_TGT_ZC_OP) { 171 + ublk_set_io_res(q, tag, cqe->res); 172 + goto exit; 173 + } 174 + assert(tgt_data == UBLK_IO_TGT_ZC_BUF); 175 + res = ublk_get_io_res(q, tag); 176 + complete: 177 + assert(tag == cqe_tag); 178 + ublk_complete_io(q, tag, res); 179 + exit: 180 + q->io_inflight--; 181 + } 182 + 183 + static int ublk_loop_tgt_init(struct ublk_dev *dev) 184 + { 185 + unsigned long long bytes; 186 + int ret; 187 + struct ublk_params p = { 188 + .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN, 189 + .basic = { 190 + .logical_bs_shift = 9, 191 + .physical_bs_shift = 12, 192 + .io_opt_shift = 12, 193 + .io_min_shift = 9, 194 + .max_sectors = dev->dev_info.max_io_buf_bytes >> 9, 195 + }, 196 + .dma = { 197 + .alignment = 511, 198 + }, 199 + }; 
200 + 201 + assert(dev->tgt.nr_backing_files == 1); 202 + ret = backing_file_tgt_init(dev); 203 + if (ret) 204 + return ret; 205 + 206 + bytes = dev->tgt.backing_file_size[0]; 207 + dev->tgt.dev_size = bytes; 208 + p.basic.dev_sectors = bytes >> 9; 209 + dev->tgt.params = p; 210 + 211 + return 0; 212 + } 213 + 214 + const struct ublk_tgt_ops loop_tgt_ops = { 215 + .name = "loop", 216 + .init_tgt = ublk_loop_tgt_init, 217 + .deinit_tgt = backing_file_tgt_deinit, 218 + .queue_io = ublk_loop_queue_io, 219 + .tgt_io_done = ublk_loop_io_done, 220 + };
+1110
tools/testing/selftests/ublk/kublk.c
··· 1 + /* SPDX-License-Identifier: MIT */ 2 + /* 3 + * Description: uring_cmd based ublk 4 + */ 5 + 6 + #include "kublk.h" 7 + 8 + unsigned int ublk_dbg_mask = UBLK_LOG; 9 + static const struct ublk_tgt_ops *tgt_ops_list[] = { 10 + &null_tgt_ops, 11 + &loop_tgt_ops, 12 + }; 13 + 14 + static const struct ublk_tgt_ops *ublk_find_tgt(const char *name) 15 + { 16 + const struct ublk_tgt_ops *ops; 17 + int i; 18 + 19 + if (name == NULL) 20 + return NULL; 21 + 22 + for (i = 0; sizeof(tgt_ops_list) / sizeof(ops); i++) 23 + if (strcmp(tgt_ops_list[i]->name, name) == 0) 24 + return tgt_ops_list[i]; 25 + return NULL; 26 + } 27 + 28 + static inline int ublk_setup_ring(struct io_uring *r, int depth, 29 + int cq_depth, unsigned flags) 30 + { 31 + struct io_uring_params p; 32 + 33 + memset(&p, 0, sizeof(p)); 34 + p.flags = flags | IORING_SETUP_CQSIZE; 35 + p.cq_entries = cq_depth; 36 + 37 + return io_uring_queue_init_params(depth, r, &p); 38 + } 39 + 40 + static void ublk_ctrl_init_cmd(struct ublk_dev *dev, 41 + struct io_uring_sqe *sqe, 42 + struct ublk_ctrl_cmd_data *data) 43 + { 44 + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; 45 + struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe); 46 + 47 + sqe->fd = dev->ctrl_fd; 48 + sqe->opcode = IORING_OP_URING_CMD; 49 + sqe->ioprio = 0; 50 + 51 + if (data->flags & CTRL_CMD_HAS_BUF) { 52 + cmd->addr = data->addr; 53 + cmd->len = data->len; 54 + } 55 + 56 + if (data->flags & CTRL_CMD_HAS_DATA) 57 + cmd->data[0] = data->data[0]; 58 + 59 + cmd->dev_id = info->dev_id; 60 + cmd->queue_id = -1; 61 + 62 + ublk_set_sqe_cmd_op(sqe, data->cmd_op); 63 + 64 + io_uring_sqe_set_data(sqe, cmd); 65 + } 66 + 67 + static int __ublk_ctrl_cmd(struct ublk_dev *dev, 68 + struct ublk_ctrl_cmd_data *data) 69 + { 70 + struct io_uring_sqe *sqe; 71 + struct io_uring_cqe *cqe; 72 + int ret = -EINVAL; 73 + 74 + sqe = io_uring_get_sqe(&dev->ring); 75 + if (!sqe) { 76 + ublk_err("%s: can't get sqe ret %d\n", __func__, ret); 77 + 
return ret; 78 + } 79 + 80 + ublk_ctrl_init_cmd(dev, sqe, data); 81 + 82 + ret = io_uring_submit(&dev->ring); 83 + if (ret < 0) { 84 + ublk_err("uring submit ret %d\n", ret); 85 + return ret; 86 + } 87 + 88 + ret = io_uring_wait_cqe(&dev->ring, &cqe); 89 + if (ret < 0) { 90 + ublk_err("wait cqe: %s\n", strerror(-ret)); 91 + return ret; 92 + } 93 + io_uring_cqe_seen(&dev->ring, cqe); 94 + 95 + return cqe->res; 96 + } 97 + 98 + static int ublk_ctrl_stop_dev(struct ublk_dev *dev) 99 + { 100 + struct ublk_ctrl_cmd_data data = { 101 + .cmd_op = UBLK_CMD_STOP_DEV, 102 + }; 103 + 104 + return __ublk_ctrl_cmd(dev, &data); 105 + } 106 + 107 + static int ublk_ctrl_start_dev(struct ublk_dev *dev, 108 + int daemon_pid) 109 + { 110 + struct ublk_ctrl_cmd_data data = { 111 + .cmd_op = UBLK_U_CMD_START_DEV, 112 + .flags = CTRL_CMD_HAS_DATA, 113 + }; 114 + 115 + dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid; 116 + 117 + return __ublk_ctrl_cmd(dev, &data); 118 + } 119 + 120 + static int ublk_ctrl_add_dev(struct ublk_dev *dev) 121 + { 122 + struct ublk_ctrl_cmd_data data = { 123 + .cmd_op = UBLK_U_CMD_ADD_DEV, 124 + .flags = CTRL_CMD_HAS_BUF, 125 + .addr = (__u64) (uintptr_t) &dev->dev_info, 126 + .len = sizeof(struct ublksrv_ctrl_dev_info), 127 + }; 128 + 129 + return __ublk_ctrl_cmd(dev, &data); 130 + } 131 + 132 + static int ublk_ctrl_del_dev(struct ublk_dev *dev) 133 + { 134 + struct ublk_ctrl_cmd_data data = { 135 + .cmd_op = UBLK_U_CMD_DEL_DEV, 136 + .flags = 0, 137 + }; 138 + 139 + return __ublk_ctrl_cmd(dev, &data); 140 + } 141 + 142 + static int ublk_ctrl_get_info(struct ublk_dev *dev) 143 + { 144 + struct ublk_ctrl_cmd_data data = { 145 + .cmd_op = UBLK_U_CMD_GET_DEV_INFO, 146 + .flags = CTRL_CMD_HAS_BUF, 147 + .addr = (__u64) (uintptr_t) &dev->dev_info, 148 + .len = sizeof(struct ublksrv_ctrl_dev_info), 149 + }; 150 + 151 + return __ublk_ctrl_cmd(dev, &data); 152 + } 153 + 154 + static int ublk_ctrl_set_params(struct ublk_dev *dev, 155 + struct ublk_params 
*params) 156 + { 157 + struct ublk_ctrl_cmd_data data = { 158 + .cmd_op = UBLK_U_CMD_SET_PARAMS, 159 + .flags = CTRL_CMD_HAS_BUF, 160 + .addr = (__u64) (uintptr_t) params, 161 + .len = sizeof(*params), 162 + }; 163 + params->len = sizeof(*params); 164 + return __ublk_ctrl_cmd(dev, &data); 165 + } 166 + 167 + static int ublk_ctrl_get_params(struct ublk_dev *dev, 168 + struct ublk_params *params) 169 + { 170 + struct ublk_ctrl_cmd_data data = { 171 + .cmd_op = UBLK_CMD_GET_PARAMS, 172 + .flags = CTRL_CMD_HAS_BUF, 173 + .addr = (__u64)params, 174 + .len = sizeof(*params), 175 + }; 176 + 177 + params->len = sizeof(*params); 178 + 179 + return __ublk_ctrl_cmd(dev, &data); 180 + } 181 + 182 + static int ublk_ctrl_get_features(struct ublk_dev *dev, 183 + __u64 *features) 184 + { 185 + struct ublk_ctrl_cmd_data data = { 186 + .cmd_op = UBLK_U_CMD_GET_FEATURES, 187 + .flags = CTRL_CMD_HAS_BUF, 188 + .addr = (__u64) (uintptr_t) features, 189 + .len = sizeof(*features), 190 + }; 191 + 192 + return __ublk_ctrl_cmd(dev, &data); 193 + } 194 + 195 + static const char *ublk_dev_state_desc(struct ublk_dev *dev) 196 + { 197 + switch (dev->dev_info.state) { 198 + case UBLK_S_DEV_DEAD: 199 + return "DEAD"; 200 + case UBLK_S_DEV_LIVE: 201 + return "LIVE"; 202 + case UBLK_S_DEV_QUIESCED: 203 + return "QUIESCED"; 204 + default: 205 + return "UNKNOWN"; 206 + }; 207 + } 208 + 209 + static void ublk_ctrl_dump(struct ublk_dev *dev) 210 + { 211 + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; 212 + struct ublk_params p; 213 + int ret; 214 + 215 + ret = ublk_ctrl_get_params(dev, &p); 216 + if (ret < 0) { 217 + ublk_err("failed to get params %m\n"); 218 + return; 219 + } 220 + 221 + ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", 222 + info->dev_id, info->nr_hw_queues, info->queue_depth, 223 + 1 << p.basic.logical_bs_shift, p.basic.dev_sectors); 224 + ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n", 225 + info->max_io_buf_bytes, 
info->ublksrv_pid, info->flags, 226 + ublk_dev_state_desc(dev)); 227 + fflush(stdout); 228 + } 229 + 230 + static void ublk_ctrl_deinit(struct ublk_dev *dev) 231 + { 232 + close(dev->ctrl_fd); 233 + free(dev); 234 + } 235 + 236 + static struct ublk_dev *ublk_ctrl_init(void) 237 + { 238 + struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev)); 239 + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; 240 + int ret; 241 + 242 + dev->ctrl_fd = open(CTRL_DEV, O_RDWR); 243 + if (dev->ctrl_fd < 0) { 244 + free(dev); 245 + return NULL; 246 + } 247 + 248 + info->max_io_buf_bytes = UBLK_IO_MAX_BYTES; 249 + 250 + ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH, 251 + UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128); 252 + if (ret < 0) { 253 + ublk_err("queue_init: %s\n", strerror(-ret)); 254 + free(dev); 255 + return NULL; 256 + } 257 + dev->nr_fds = 1; 258 + 259 + return dev; 260 + } 261 + 262 + static int __ublk_queue_cmd_buf_sz(unsigned depth) 263 + { 264 + int size = depth * sizeof(struct ublksrv_io_desc); 265 + unsigned int page_sz = getpagesize(); 266 + 267 + return round_up(size, page_sz); 268 + } 269 + 270 + static int ublk_queue_max_cmd_buf_sz(void) 271 + { 272 + return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH); 273 + } 274 + 275 + static int ublk_queue_cmd_buf_sz(struct ublk_queue *q) 276 + { 277 + return __ublk_queue_cmd_buf_sz(q->q_depth); 278 + } 279 + 280 + static void ublk_queue_deinit(struct ublk_queue *q) 281 + { 282 + int i; 283 + int nr_ios = q->q_depth; 284 + 285 + io_uring_unregister_buffers(&q->ring); 286 + 287 + io_uring_unregister_ring_fd(&q->ring); 288 + 289 + if (q->ring.ring_fd > 0) { 290 + io_uring_unregister_files(&q->ring); 291 + close(q->ring.ring_fd); 292 + q->ring.ring_fd = -1; 293 + } 294 + 295 + if (q->io_cmd_buf) 296 + munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); 297 + 298 + for (i = 0; i < nr_ios; i++) 299 + free(q->ios[i].buf_addr); 300 + } 301 + 302 + static int ublk_queue_init(struct ublk_queue *q) 303 + { 304 + 
struct ublk_dev *dev = q->dev; 305 + int depth = dev->dev_info.queue_depth; 306 + int i, ret = -1; 307 + int cmd_buf_size, io_buf_size; 308 + unsigned long off; 309 + int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; 310 + 311 + q->tgt_ops = dev->tgt.ops; 312 + q->state = 0; 313 + q->q_depth = depth; 314 + q->cmd_inflight = 0; 315 + q->tid = gettid(); 316 + 317 + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { 318 + q->state |= UBLKSRV_NO_BUF; 319 + q->state |= UBLKSRV_ZC; 320 + } 321 + 322 + cmd_buf_size = ublk_queue_cmd_buf_sz(q); 323 + off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); 324 + q->io_cmd_buf = (char *)mmap(0, cmd_buf_size, PROT_READ, 325 + MAP_SHARED | MAP_POPULATE, dev->fds[0], off); 326 + if (q->io_cmd_buf == MAP_FAILED) { 327 + ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n", 328 + q->dev->dev_info.dev_id, q->q_id); 329 + goto fail; 330 + } 331 + 332 + io_buf_size = dev->dev_info.max_io_buf_bytes; 333 + for (i = 0; i < q->q_depth; i++) { 334 + q->ios[i].buf_addr = NULL; 335 + q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE; 336 + 337 + if (q->state & UBLKSRV_NO_BUF) 338 + continue; 339 + 340 + if (posix_memalign((void **)&q->ios[i].buf_addr, 341 + getpagesize(), io_buf_size)) { 342 + ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n", 343 + dev->dev_info.dev_id, q->q_id, i); 344 + goto fail; 345 + } 346 + } 347 + 348 + ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth, 349 + IORING_SETUP_COOP_TASKRUN); 350 + if (ret < 0) { 351 + ublk_err("ublk dev %d queue %d setup io_uring failed %d\n", 352 + q->dev->dev_info.dev_id, q->q_id, ret); 353 + goto fail; 354 + } 355 + 356 + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { 357 + ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth); 358 + if (ret) { 359 + ublk_err("ublk dev %d queue %d register spare buffers failed %d", 360 + dev->dev_info.dev_id, q->q_id, ret); 361 + goto fail; 362 + } 363 + } 364 + 365 + 
io_uring_register_ring_fd(&q->ring); 366 + 367 + ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds); 368 + if (ret) { 369 + ublk_err("ublk dev %d queue %d register files failed %d\n", 370 + q->dev->dev_info.dev_id, q->q_id, ret); 371 + goto fail; 372 + } 373 + 374 + return 0; 375 + fail: 376 + ublk_queue_deinit(q); 377 + ublk_err("ublk dev %d queue %d failed\n", 378 + dev->dev_info.dev_id, q->q_id); 379 + return -ENOMEM; 380 + } 381 + 382 + static int ublk_dev_prep(struct ublk_dev *dev) 383 + { 384 + int dev_id = dev->dev_info.dev_id; 385 + char buf[64]; 386 + int ret = 0; 387 + 388 + snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id); 389 + dev->fds[0] = open(buf, O_RDWR); 390 + if (dev->fds[0] < 0) { 391 + ret = -EBADF; 392 + ublk_err("can't open %s, ret %d\n", buf, dev->fds[0]); 393 + goto fail; 394 + } 395 + 396 + if (dev->tgt.ops->init_tgt) 397 + ret = dev->tgt.ops->init_tgt(dev); 398 + 399 + return ret; 400 + fail: 401 + close(dev->fds[0]); 402 + return ret; 403 + } 404 + 405 + static void ublk_dev_unprep(struct ublk_dev *dev) 406 + { 407 + if (dev->tgt.ops->deinit_tgt) 408 + dev->tgt.ops->deinit_tgt(dev); 409 + close(dev->fds[0]); 410 + } 411 + 412 + int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag) 413 + { 414 + struct ublksrv_io_cmd *cmd; 415 + struct io_uring_sqe *sqe; 416 + unsigned int cmd_op = 0; 417 + __u64 user_data; 418 + 419 + /* only freed io can be issued */ 420 + if (!(io->flags & UBLKSRV_IO_FREE)) 421 + return 0; 422 + 423 + /* we issue because we need either fetching or committing */ 424 + if (!(io->flags & 425 + (UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP))) 426 + return 0; 427 + 428 + if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP) 429 + cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ; 430 + else if (io->flags & UBLKSRV_NEED_FETCH_RQ) 431 + cmd_op = UBLK_U_IO_FETCH_REQ; 432 + 433 + if (io_uring_sq_space_left(&q->ring) < 1) 434 + io_uring_submit(&q->ring); 435 + 436 + sqe = ublk_queue_alloc_sqe(q); 437 + if 
(!sqe) { 438 + ublk_err("%s: run out of sqe %d, tag %d\n", 439 + __func__, q->q_id, tag); 440 + return -1; 441 + } 442 + 443 + cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe); 444 + 445 + if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ) 446 + cmd->result = io->result; 447 + 448 + /* These fields should be written once, never change */ 449 + ublk_set_sqe_cmd_op(sqe, cmd_op); 450 + sqe->fd = 0; /* dev->fds[0] */ 451 + sqe->opcode = IORING_OP_URING_CMD; 452 + sqe->flags = IOSQE_FIXED_FILE; 453 + sqe->rw_flags = 0; 454 + cmd->tag = tag; 455 + cmd->q_id = q->q_id; 456 + if (!(q->state & UBLKSRV_NO_BUF)) 457 + cmd->addr = (__u64) (uintptr_t) io->buf_addr; 458 + else 459 + cmd->addr = 0; 460 + 461 + user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0); 462 + io_uring_sqe_set_data64(sqe, user_data); 463 + 464 + io->flags = 0; 465 + 466 + q->cmd_inflight += 1; 467 + 468 + ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n", 469 + __func__, q->q_id, tag, cmd_op, 470 + io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING)); 471 + return 1; 472 + } 473 + 474 + static void ublk_submit_fetch_commands(struct ublk_queue *q) 475 + { 476 + int i = 0; 477 + 478 + for (i = 0; i < q->q_depth; i++) 479 + ublk_queue_io_cmd(q, &q->ios[i], i); 480 + } 481 + 482 + static int ublk_queue_is_idle(struct ublk_queue *q) 483 + { 484 + return !io_uring_sq_ready(&q->ring) && !q->io_inflight; 485 + } 486 + 487 + static int ublk_queue_is_done(struct ublk_queue *q) 488 + { 489 + return (q->state & UBLKSRV_QUEUE_STOPPING) && ublk_queue_is_idle(q); 490 + } 491 + 492 + static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q, 493 + struct io_uring_cqe *cqe) 494 + { 495 + unsigned tag = user_data_to_tag(cqe->user_data); 496 + 497 + if (cqe->res < 0 && cqe->res != -EAGAIN) 498 + ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n", 499 + __func__, cqe->res, q->q_id, 500 + user_data_to_tag(cqe->user_data), 501 + user_data_to_op(cqe->user_data)); 502 + 503 + if 
(q->tgt_ops->tgt_io_done) 504 + q->tgt_ops->tgt_io_done(q, tag, cqe); 505 + } 506 + 507 + static void ublk_handle_cqe(struct io_uring *r, 508 + struct io_uring_cqe *cqe, void *data) 509 + { 510 + struct ublk_queue *q = container_of(r, struct ublk_queue, ring); 511 + unsigned tag = user_data_to_tag(cqe->user_data); 512 + unsigned cmd_op = user_data_to_op(cqe->user_data); 513 + int fetch = (cqe->res != UBLK_IO_RES_ABORT) && 514 + !(q->state & UBLKSRV_QUEUE_STOPPING); 515 + struct ublk_io *io; 516 + 517 + if (cqe->res < 0 && cqe->res != -ENODEV) 518 + ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, 519 + cqe->res, cqe->user_data, q->state); 520 + 521 + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", 522 + __func__, cqe->res, q->q_id, tag, cmd_op, 523 + is_target_io(cqe->user_data), 524 + user_data_to_tgt_data(cqe->user_data), 525 + (q->state & UBLKSRV_QUEUE_STOPPING)); 526 + 527 + /* Don't retrieve io in case of target io */ 528 + if (is_target_io(cqe->user_data)) { 529 + ublksrv_handle_tgt_cqe(q, cqe); 530 + return; 531 + } 532 + 533 + io = &q->ios[tag]; 534 + q->cmd_inflight--; 535 + 536 + if (!fetch) { 537 + q->state |= UBLKSRV_QUEUE_STOPPING; 538 + io->flags &= ~UBLKSRV_NEED_FETCH_RQ; 539 + } 540 + 541 + if (cqe->res == UBLK_IO_RES_OK) { 542 + assert(tag < q->q_depth); 543 + if (q->tgt_ops->queue_io) 544 + q->tgt_ops->queue_io(q, tag); 545 + } else { 546 + /* 547 + * COMMIT_REQ will be completed immediately since no fetching 548 + * piggyback is required. 
549 + * 550 + * Marking IO_FREE only, then this io won't be issued since 551 + * we only issue io with (UBLKSRV_IO_FREE | UBLKSRV_NEED_*) 552 + * 553 + * */ 554 + io->flags = UBLKSRV_IO_FREE; 555 + } 556 + } 557 + 558 + static int ublk_reap_events_uring(struct io_uring *r) 559 + { 560 + struct io_uring_cqe *cqe; 561 + unsigned head; 562 + int count = 0; 563 + 564 + io_uring_for_each_cqe(r, head, cqe) { 565 + ublk_handle_cqe(r, cqe, NULL); 566 + count += 1; 567 + } 568 + io_uring_cq_advance(r, count); 569 + 570 + return count; 571 + } 572 + 573 + static int ublk_process_io(struct ublk_queue *q) 574 + { 575 + int ret, reapped; 576 + 577 + ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight cmd %u stopping %d\n", 578 + q->dev->dev_info.dev_id, 579 + q->q_id, io_uring_sq_ready(&q->ring), 580 + q->cmd_inflight, 581 + (q->state & UBLKSRV_QUEUE_STOPPING)); 582 + 583 + if (ublk_queue_is_done(q)) 584 + return -ENODEV; 585 + 586 + ret = io_uring_submit_and_wait(&q->ring, 1); 587 + reapped = ublk_reap_events_uring(&q->ring); 588 + 589 + ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n", 590 + ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING), 591 + (q->state & UBLKSRV_QUEUE_IDLE)); 592 + 593 + return reapped; 594 + } 595 + 596 + static void *ublk_io_handler_fn(void *data) 597 + { 598 + struct ublk_queue *q = data; 599 + int dev_id = q->dev->dev_info.dev_id; 600 + int ret; 601 + 602 + ret = ublk_queue_init(q); 603 + if (ret) { 604 + ublk_err("ublk dev %d queue %d init queue failed\n", 605 + dev_id, q->q_id); 606 + return NULL; 607 + } 608 + ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n", 609 + q->tid, dev_id, q->q_id); 610 + 611 + /* submit all io commands to ublk driver */ 612 + ublk_submit_fetch_commands(q); 613 + do { 614 + if (ublk_process_io(q) < 0) 615 + break; 616 + } while (1); 617 + 618 + ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id); 619 + ublk_queue_deinit(q); 620 + return NULL; 621 + } 
622 + 623 + static void ublk_set_parameters(struct ublk_dev *dev) 624 + { 625 + int ret; 626 + 627 + ret = ublk_ctrl_set_params(dev, &dev->tgt.params); 628 + if (ret) 629 + ublk_err("dev %d set basic parameter failed %d\n", 630 + dev->dev_info.dev_id, ret); 631 + } 632 + 633 + static int ublk_send_dev_event(const struct dev_ctx *ctx, int dev_id) 634 + { 635 + uint64_t id; 636 + int evtfd = ctx->_evtfd; 637 + 638 + if (evtfd < 0) 639 + return -EBADF; 640 + 641 + if (dev_id >= 0) 642 + id = dev_id + 1; 643 + else 644 + id = ERROR_EVTFD_DEVID; 645 + 646 + if (write(evtfd, &id, sizeof(id)) != sizeof(id)) 647 + return -EINVAL; 648 + 649 + return 0; 650 + } 651 + 652 + 653 + static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) 654 + { 655 + int ret, i; 656 + void *thread_ret; 657 + const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; 658 + 659 + ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__); 660 + 661 + ret = ublk_dev_prep(dev); 662 + if (ret) 663 + return ret; 664 + 665 + for (i = 0; i < dinfo->nr_hw_queues; i++) { 666 + dev->q[i].dev = dev; 667 + dev->q[i].q_id = i; 668 + pthread_create(&dev->q[i].thread, NULL, 669 + ublk_io_handler_fn, 670 + &dev->q[i]); 671 + } 672 + 673 + /* everything is fine now, start us */ 674 + ublk_set_parameters(dev); 675 + ret = ublk_ctrl_start_dev(dev, getpid()); 676 + if (ret < 0) { 677 + ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret); 678 + goto fail; 679 + } 680 + 681 + ublk_ctrl_get_info(dev); 682 + ublk_send_dev_event(ctx, dev->dev_info.dev_id); 683 + 684 + /* wait until we are terminated */ 685 + for (i = 0; i < dinfo->nr_hw_queues; i++) 686 + pthread_join(dev->q[i].thread, &thread_ret); 687 + fail: 688 + ublk_dev_unprep(dev); 689 + ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__); 690 + 691 + return ret; 692 + } 693 + 694 + static int wait_ublk_dev(char *dev_name, int evt_mask, unsigned timeout) 695 + { 696 + #define EV_SIZE (sizeof(struct inotify_event)) 697 + #define EV_BUF_LEN (128 * 
(EV_SIZE + 16)) 698 + struct pollfd pfd; 699 + int fd, wd; 700 + int ret = -EINVAL; 701 + 702 + fd = inotify_init(); 703 + if (fd < 0) { 704 + ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__); 705 + return fd; 706 + } 707 + 708 + wd = inotify_add_watch(fd, "/dev", evt_mask); 709 + if (wd == -1) { 710 + ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__); 711 + goto fail; 712 + } 713 + 714 + pfd.fd = fd; 715 + pfd.events = POLL_IN; 716 + while (1) { 717 + int i = 0; 718 + char buffer[EV_BUF_LEN]; 719 + ret = poll(&pfd, 1, 1000 * timeout); 720 + 721 + if (ret == -1) { 722 + ublk_err("%s: poll inotify failed: %d\n", __func__, ret); 723 + goto rm_watch; 724 + } else if (ret == 0) { 725 + ublk_err("%s: poll inotify timeout\n", __func__); 726 + ret = -ETIMEDOUT; 727 + goto rm_watch; 728 + } 729 + 730 + ret = read(fd, buffer, EV_BUF_LEN); 731 + if (ret < 0) { 732 + ublk_err("%s: read inotify fd failed\n", __func__); 733 + goto rm_watch; 734 + } 735 + 736 + while (i < ret) { 737 + struct inotify_event *event = (struct inotify_event *)&buffer[i]; 738 + 739 + ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n", 740 + __func__, event->mask, event->name); 741 + if (event->mask & evt_mask) { 742 + if (!strcmp(event->name, dev_name)) { 743 + ret = 0; 744 + goto rm_watch; 745 + } 746 + } 747 + i += EV_SIZE + event->len; 748 + } 749 + } 750 + rm_watch: 751 + inotify_rm_watch(fd, wd); 752 + fail: 753 + close(fd); 754 + return ret; 755 + } 756 + 757 + static int ublk_stop_io_daemon(const struct ublk_dev *dev) 758 + { 759 + int daemon_pid = dev->dev_info.ublksrv_pid; 760 + int dev_id = dev->dev_info.dev_id; 761 + char ublkc[64]; 762 + int ret = 0; 763 + 764 + /* daemon may be dead already */ 765 + if (kill(daemon_pid, 0) < 0) 766 + goto wait; 767 + 768 + /* 769 + * Wait until ublk char device is closed, when our daemon is shutdown 770 + */ 771 + snprintf(ublkc, sizeof(ublkc), "%s%d", "ublkc", dev_id); 772 + ret = wait_ublk_dev(ublkc, IN_CLOSE_WRITE, 10); 
773 + /* double check and inotify may not be 100% reliable */ 774 + if (ret == -ETIMEDOUT) 775 + /* the daemon doesn't exist now if kill(0) fails */ 776 + ret = kill(daemon_pid, 0) < 0; 777 + wait: 778 + waitpid(daemon_pid, NULL, 0); 779 + ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n", 780 + __func__, daemon_pid, dev_id, ret); 781 + 782 + return ret; 783 + } 784 + 785 + static int __cmd_dev_add(const struct dev_ctx *ctx) 786 + { 787 + unsigned nr_queues = ctx->nr_hw_queues; 788 + const char *tgt_type = ctx->tgt_type; 789 + unsigned depth = ctx->queue_depth; 790 + __u64 features; 791 + const struct ublk_tgt_ops *ops; 792 + struct ublksrv_ctrl_dev_info *info; 793 + struct ublk_dev *dev; 794 + int dev_id = ctx->dev_id; 795 + int ret, i; 796 + 797 + ops = ublk_find_tgt(tgt_type); 798 + if (!ops) { 799 + ublk_err("%s: no such tgt type, type %s\n", 800 + __func__, tgt_type); 801 + return -ENODEV; 802 + } 803 + 804 + if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) { 805 + ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n", 806 + __func__, nr_queues, depth); 807 + return -EINVAL; 808 + } 809 + 810 + dev = ublk_ctrl_init(); 811 + if (!dev) { 812 + ublk_err("%s: can't alloc dev id %d, type %s\n", 813 + __func__, dev_id, tgt_type); 814 + return -ENOMEM; 815 + } 816 + 817 + /* kernel doesn't support get_features */ 818 + ret = ublk_ctrl_get_features(dev, &features); 819 + if (ret < 0) 820 + return -EINVAL; 821 + 822 + if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) 823 + return -ENOTSUP; 824 + 825 + info = &dev->dev_info; 826 + info->dev_id = ctx->dev_id; 827 + info->nr_hw_queues = nr_queues; 828 + info->queue_depth = depth; 829 + info->flags = ctx->flags; 830 + dev->tgt.ops = ops; 831 + dev->tgt.sq_depth = depth; 832 + dev->tgt.cq_depth = depth; 833 + 834 + for (i = 0; i < MAX_BACK_FILES; i++) { 835 + if (ctx->files[i]) { 836 + strcpy(dev->tgt.backing_file[i], ctx->files[i]); 837 + dev->tgt.nr_backing_files++; 838 + } 839 + } 840 + 841 + ret = 
ublk_ctrl_add_dev(dev); 842 + if (ret < 0) { 843 + ublk_err("%s: can't add dev id %d, type %s ret %d\n", 844 + __func__, dev_id, tgt_type, ret); 845 + goto fail; 846 + } 847 + 848 + ret = ublk_start_daemon(ctx, dev); 849 + ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\b", ret); 850 + 851 + fail: 852 + if (ret < 0) 853 + ublk_send_dev_event(ctx, -1); 854 + ublk_ctrl_deinit(dev); 855 + return ret; 856 + } 857 + 858 + static int __cmd_dev_list(struct dev_ctx *ctx); 859 + 860 + static int cmd_dev_add(struct dev_ctx *ctx) 861 + { 862 + int res; 863 + 864 + ctx->_evtfd = eventfd(0, 0); 865 + if (ctx->_evtfd < 0) { 866 + ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno)); 867 + exit(-1); 868 + } 869 + 870 + setsid(); 871 + res = fork(); 872 + if (res == 0) { 873 + __cmd_dev_add(ctx); 874 + exit(EXIT_SUCCESS); 875 + } else if (res > 0) { 876 + uint64_t id; 877 + 878 + res = read(ctx->_evtfd, &id, sizeof(id)); 879 + close(ctx->_evtfd); 880 + if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) { 881 + ctx->dev_id = id - 1; 882 + return __cmd_dev_list(ctx); 883 + } 884 + exit(EXIT_FAILURE); 885 + } else { 886 + return res; 887 + } 888 + } 889 + 890 + static int __cmd_dev_del(struct dev_ctx *ctx) 891 + { 892 + int number = ctx->dev_id; 893 + struct ublk_dev *dev; 894 + int ret; 895 + 896 + dev = ublk_ctrl_init(); 897 + dev->dev_info.dev_id = number; 898 + 899 + ret = ublk_ctrl_get_info(dev); 900 + if (ret < 0) 901 + goto fail; 902 + 903 + ret = ublk_ctrl_stop_dev(dev); 904 + if (ret < 0) 905 + ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret); 906 + 907 + ret = ublk_stop_io_daemon(dev); 908 + if (ret < 0) 909 + ublk_err("%s: stop daemon id %d dev %d, ret %d\n", 910 + __func__, dev->dev_info.ublksrv_pid, number, ret); 911 + ublk_ctrl_del_dev(dev); 912 + fail: 913 + if (ret >= 0) 914 + ret = ublk_ctrl_get_info(dev); 915 + ublk_ctrl_deinit(dev); 916 + 917 + return (ret >= 0) ? 
0 : ret; 918 + } 919 + 920 + static int cmd_dev_del(struct dev_ctx *ctx) 921 + { 922 + int i; 923 + 924 + if (ctx->dev_id >= 0 || !ctx->all) 925 + return __cmd_dev_del(ctx); 926 + 927 + for (i = 0; i < 255; i++) { 928 + ctx->dev_id = i; 929 + __cmd_dev_del(ctx); 930 + } 931 + return 0; 932 + } 933 + 934 + static int __cmd_dev_list(struct dev_ctx *ctx) 935 + { 936 + struct ublk_dev *dev = ublk_ctrl_init(); 937 + int ret; 938 + 939 + if (!dev) 940 + return -ENODEV; 941 + 942 + dev->dev_info.dev_id = ctx->dev_id; 943 + 944 + ret = ublk_ctrl_get_info(dev); 945 + if (ret < 0) { 946 + if (ctx->logging) 947 + ublk_err("%s: can't get dev info from %d: %d\n", 948 + __func__, ctx->dev_id, ret); 949 + } else { 950 + ublk_ctrl_dump(dev); 951 + } 952 + 953 + ublk_ctrl_deinit(dev); 954 + 955 + return ret; 956 + } 957 + 958 + static int cmd_dev_list(struct dev_ctx *ctx) 959 + { 960 + int i; 961 + 962 + if (ctx->dev_id >= 0 || !ctx->all) 963 + return __cmd_dev_list(ctx); 964 + 965 + ctx->logging = false; 966 + for (i = 0; i < 255; i++) { 967 + ctx->dev_id = i; 968 + __cmd_dev_list(ctx); 969 + } 970 + return 0; 971 + } 972 + 973 + static int cmd_dev_get_features(void) 974 + { 975 + #define const_ilog2(x) (63 - __builtin_clzll(x)) 976 + static const char *feat_map[] = { 977 + [const_ilog2(UBLK_F_SUPPORT_ZERO_COPY)] = "ZERO_COPY", 978 + [const_ilog2(UBLK_F_URING_CMD_COMP_IN_TASK)] = "COMP_IN_TASK", 979 + [const_ilog2(UBLK_F_NEED_GET_DATA)] = "GET_DATA", 980 + [const_ilog2(UBLK_F_USER_RECOVERY)] = "USER_RECOVERY", 981 + [const_ilog2(UBLK_F_USER_RECOVERY_REISSUE)] = "RECOVERY_REISSUE", 982 + [const_ilog2(UBLK_F_UNPRIVILEGED_DEV)] = "UNPRIVILEGED_DEV", 983 + [const_ilog2(UBLK_F_CMD_IOCTL_ENCODE)] = "CMD_IOCTL_ENCODE", 984 + [const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY", 985 + [const_ilog2(UBLK_F_ZONED)] = "ZONED", 986 + [const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO", 987 + }; 988 + struct ublk_dev *dev; 989 + __u64 features = 0; 990 + int ret; 991 + 992 + dev = 
ublk_ctrl_init(); 993 + if (!dev) { 994 + fprintf(stderr, "ublksrv_ctrl_init failed id\n"); 995 + return -EOPNOTSUPP; 996 + } 997 + 998 + ret = ublk_ctrl_get_features(dev, &features); 999 + if (!ret) { 1000 + int i; 1001 + 1002 + printf("ublk_drv features: 0x%llx\n", features); 1003 + 1004 + for (i = 0; i < sizeof(features) * 8; i++) { 1005 + const char *feat; 1006 + 1007 + if (!((1ULL << i) & features)) 1008 + continue; 1009 + if (i < sizeof(feat_map) / sizeof(feat_map[0])) 1010 + feat = feat_map[i]; 1011 + else 1012 + feat = "unknown"; 1013 + printf("\t%-20s: 0x%llx\n", feat, 1ULL << i); 1014 + } 1015 + } 1016 + 1017 + return ret; 1018 + } 1019 + 1020 + static int cmd_dev_help(char *exe) 1021 + { 1022 + printf("%s add -t [null|loop] [-q nr_queues] [-d depth] [-n dev_id] [backfile1] [backfile2] ...\n", exe); 1023 + printf("\t default: nr_queues=2(max 4), depth=128(max 128), dev_id=-1(auto allocation)\n"); 1024 + printf("%s del [-n dev_id] -a \n", exe); 1025 + printf("\t -a delete all devices -n delete specified device\n"); 1026 + printf("%s list [-n dev_id] -a \n", exe); 1027 + printf("\t -a list all devices, -n list specified device, default -a \n"); 1028 + printf("%s features\n", exe); 1029 + return 0; 1030 + } 1031 + 1032 + int main(int argc, char *argv[]) 1033 + { 1034 + static const struct option longopts[] = { 1035 + { "all", 0, NULL, 'a' }, 1036 + { "type", 1, NULL, 't' }, 1037 + { "number", 1, NULL, 'n' }, 1038 + { "queues", 1, NULL, 'q' }, 1039 + { "depth", 1, NULL, 'd' }, 1040 + { "debug_mask", 1, NULL, 0 }, 1041 + { "quiet", 0, NULL, 0 }, 1042 + { "zero_copy", 1, NULL, 'z' }, 1043 + { 0, 0, 0, 0 } 1044 + }; 1045 + int option_idx, opt; 1046 + const char *cmd = argv[1]; 1047 + struct dev_ctx ctx = { 1048 + .queue_depth = 128, 1049 + .nr_hw_queues = 2, 1050 + .dev_id = -1, 1051 + .tgt_type = "unknown", 1052 + }; 1053 + int ret = -EINVAL, i; 1054 + 1055 + if (argc == 1) 1056 + return ret; 1057 + 1058 + optind = 2; 1059 + while ((opt = getopt_long(argc, 
argv, "t:n:d:q:a:z", 1060 + longopts, &option_idx)) != -1) { 1061 + switch (opt) { 1062 + case 'a': 1063 + ctx.all = 1; 1064 + break; 1065 + case 'n': 1066 + ctx.dev_id = strtol(optarg, NULL, 10); 1067 + break; 1068 + case 't': 1069 + if (strlen(optarg) < sizeof(ctx.tgt_type)) 1070 + strcpy(ctx.tgt_type, optarg); 1071 + break; 1072 + case 'q': 1073 + ctx.nr_hw_queues = strtol(optarg, NULL, 10); 1074 + break; 1075 + case 'd': 1076 + ctx.queue_depth = strtol(optarg, NULL, 10); 1077 + break; 1078 + case 'z': 1079 + ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY; 1080 + break; 1081 + case 0: 1082 + if (!strcmp(longopts[option_idx].name, "debug_mask")) 1083 + ublk_dbg_mask = strtol(optarg, NULL, 16); 1084 + if (!strcmp(longopts[option_idx].name, "quiet")) 1085 + ublk_dbg_mask = 0; 1086 + break; 1087 + } 1088 + } 1089 + 1090 + i = optind; 1091 + while (i < argc && ctx.nr_files < MAX_BACK_FILES) { 1092 + ctx.files[ctx.nr_files++] = argv[i++]; 1093 + } 1094 + 1095 + if (!strcmp(cmd, "add")) 1096 + ret = cmd_dev_add(&ctx); 1097 + else if (!strcmp(cmd, "del")) 1098 + ret = cmd_dev_del(&ctx); 1099 + else if (!strcmp(cmd, "list")) { 1100 + ctx.all = 1; 1101 + ret = cmd_dev_list(&ctx); 1102 + } else if (!strcmp(cmd, "help")) 1103 + ret = cmd_dev_help(argv[0]); 1104 + else if (!strcmp(cmd, "features")) 1105 + ret = cmd_dev_get_features(); 1106 + else 1107 + cmd_dev_help(argv[0]); 1108 + 1109 + return ret; 1110 + }
+326
tools/testing/selftests/ublk/kublk.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef KUBLK_INTERNAL_H 3 + #define KUBLK_INTERNAL_H 4 + 5 + #include <unistd.h> 6 + #include <stdlib.h> 7 + #include <assert.h> 8 + #include <stdio.h> 9 + #include <stdarg.h> 10 + #include <string.h> 11 + #include <pthread.h> 12 + #include <getopt.h> 13 + #include <limits.h> 14 + #include <poll.h> 15 + #include <sys/syscall.h> 16 + #include <sys/mman.h> 17 + #include <sys/ioctl.h> 18 + #include <sys/inotify.h> 19 + #include <sys/wait.h> 20 + #include <sys/eventfd.h> 21 + #include <liburing.h> 22 + #include <linux/ublk_cmd.h> 23 + 24 + #define __maybe_unused __attribute__((unused)) 25 + #define MAX_BACK_FILES 4 26 + #ifndef min 27 + #define min(a, b) ((a) < (b) ? (a) : (b)) 28 + #endif 29 + 30 + /****************** part 1: libublk ********************/ 31 + 32 + #define CTRL_DEV "/dev/ublk-control" 33 + #define UBLKC_DEV "/dev/ublkc" 34 + #define UBLKB_DEV "/dev/ublkb" 35 + #define UBLK_CTRL_RING_DEPTH 32 36 + #define ERROR_EVTFD_DEVID -2 37 + 38 + /* queue idle timeout */ 39 + #define UBLKSRV_IO_IDLE_SECS 20 40 + 41 + #define UBLK_IO_MAX_BYTES 65536 42 + #define UBLK_MAX_QUEUES 4 43 + #define UBLK_QUEUE_DEPTH 128 44 + 45 + #define UBLK_IO_TGT_NORMAL 0 46 + #define UBLK_IO_TGT_ZC_BUF 1 47 + #define UBLK_IO_TGT_ZC_OP 2 48 + 49 + #define UBLK_DBG_DEV (1U << 0) 50 + #define UBLK_DBG_QUEUE (1U << 1) 51 + #define UBLK_DBG_IO_CMD (1U << 2) 52 + #define UBLK_DBG_IO (1U << 3) 53 + #define UBLK_DBG_CTRL_CMD (1U << 4) 54 + #define UBLK_LOG (1U << 5) 55 + 56 + struct ublk_dev; 57 + struct ublk_queue; 58 + 59 + struct dev_ctx { 60 + char tgt_type[16]; 61 + unsigned long flags; 62 + unsigned nr_hw_queues; 63 + unsigned queue_depth; 64 + int dev_id; 65 + int nr_files; 66 + char *files[MAX_BACK_FILES]; 67 + unsigned int logging:1; 68 + unsigned int all:1; 69 + 70 + int _evtfd; 71 + }; 72 + 73 + struct ublk_ctrl_cmd_data { 74 + __u32 cmd_op; 75 + #define CTRL_CMD_HAS_DATA 1 76 + #define CTRL_CMD_HAS_BUF 2 77 + __u32 flags; 78 + 
79 + __u64 data[2]; 80 + __u64 addr; 81 + __u32 len; 82 + }; 83 + 84 + struct ublk_io { 85 + char *buf_addr; 86 + 87 + #define UBLKSRV_NEED_FETCH_RQ (1UL << 0) 88 + #define UBLKSRV_NEED_COMMIT_RQ_COMP (1UL << 1) 89 + #define UBLKSRV_IO_FREE (1UL << 2) 90 + unsigned short flags; 91 + unsigned short refs; /* used by target code only */ 92 + 93 + int result; 94 + }; 95 + 96 + struct ublk_tgt_ops { 97 + const char *name; 98 + int (*init_tgt)(struct ublk_dev *); 99 + void (*deinit_tgt)(struct ublk_dev *); 100 + 101 + int (*queue_io)(struct ublk_queue *, int tag); 102 + void (*tgt_io_done)(struct ublk_queue *, 103 + int tag, const struct io_uring_cqe *); 104 + }; 105 + 106 + struct ublk_tgt { 107 + unsigned long dev_size; 108 + unsigned int sq_depth; 109 + unsigned int cq_depth; 110 + const struct ublk_tgt_ops *ops; 111 + struct ublk_params params; 112 + 113 + int nr_backing_files; 114 + unsigned long backing_file_size[MAX_BACK_FILES]; 115 + char backing_file[MAX_BACK_FILES][PATH_MAX]; 116 + }; 117 + 118 + struct ublk_queue { 119 + int q_id; 120 + int q_depth; 121 + unsigned int cmd_inflight; 122 + unsigned int io_inflight; 123 + struct ublk_dev *dev; 124 + const struct ublk_tgt_ops *tgt_ops; 125 + char *io_cmd_buf; 126 + struct io_uring ring; 127 + struct ublk_io ios[UBLK_QUEUE_DEPTH]; 128 + #define UBLKSRV_QUEUE_STOPPING (1U << 0) 129 + #define UBLKSRV_QUEUE_IDLE (1U << 1) 130 + #define UBLKSRV_NO_BUF (1U << 2) 131 + #define UBLKSRV_ZC (1U << 3) 132 + unsigned state; 133 + pid_t tid; 134 + pthread_t thread; 135 + }; 136 + 137 + struct ublk_dev { 138 + struct ublk_tgt tgt; 139 + struct ublksrv_ctrl_dev_info dev_info; 140 + struct ublk_queue q[UBLK_MAX_QUEUES]; 141 + 142 + int fds[MAX_BACK_FILES + 1]; /* fds[0] points to /dev/ublkcN */ 143 + int nr_fds; 144 + int ctrl_fd; 145 + struct io_uring ring; 146 + }; 147 + 148 + #ifndef offsetof 149 + #define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) 150 + #endif 151 + 152 + #ifndef container_of 153 + #define 
container_of(ptr, type, member) ({ \ 154 + unsigned long __mptr = (unsigned long)(ptr); \ 155 + ((type *)(__mptr - offsetof(type, member))); }) 156 + #endif 157 + 158 + #define round_up(val, rnd) \ 159 + (((val) + ((rnd) - 1)) & ~((rnd) - 1)) 160 + 161 + 162 + extern unsigned int ublk_dbg_mask; 163 + extern int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag); 164 + 165 + static inline int is_target_io(__u64 user_data) 166 + { 167 + return (user_data & (1ULL << 63)) != 0; 168 + } 169 + 170 + static inline __u64 build_user_data(unsigned tag, unsigned op, 171 + unsigned tgt_data, unsigned is_target_io) 172 + { 173 + assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16)); 174 + 175 + return tag | (op << 16) | (tgt_data << 24) | (__u64)is_target_io << 63; 176 + } 177 + 178 + static inline unsigned int user_data_to_tag(__u64 user_data) 179 + { 180 + return user_data & 0xffff; 181 + } 182 + 183 + static inline unsigned int user_data_to_op(__u64 user_data) 184 + { 185 + return (user_data >> 16) & 0xff; 186 + } 187 + 188 + static inline unsigned int user_data_to_tgt_data(__u64 user_data) 189 + { 190 + return (user_data >> 24) & 0xffff; 191 + } 192 + 193 + static inline void ublk_err(const char *fmt, ...) 194 + { 195 + va_list ap; 196 + 197 + va_start(ap, fmt); 198 + vfprintf(stderr, fmt, ap); 199 + } 200 + 201 + static inline void ublk_log(const char *fmt, ...) 202 + { 203 + if (ublk_dbg_mask & UBLK_LOG) { 204 + va_list ap; 205 + 206 + va_start(ap, fmt); 207 + vfprintf(stdout, fmt, ap); 208 + } 209 + } 210 + 211 + static inline void ublk_dbg(int level, const char *fmt, ...) 
212 + { 213 + if (level & ublk_dbg_mask) { 214 + va_list ap; 215 + 216 + va_start(ap, fmt); 217 + vfprintf(stdout, fmt, ap); 218 + } 219 + } 220 + 221 + static inline struct io_uring_sqe *ublk_queue_alloc_sqe(struct ublk_queue *q) 222 + { 223 + unsigned left = io_uring_sq_space_left(&q->ring); 224 + 225 + if (left < 1) 226 + io_uring_submit(&q->ring); 227 + return io_uring_get_sqe(&q->ring); 228 + } 229 + 230 + static inline void ublk_queue_alloc_sqe3(struct ublk_queue *q, 231 + struct io_uring_sqe **sqe1, struct io_uring_sqe **sqe2, 232 + struct io_uring_sqe **sqe3) 233 + { 234 + struct io_uring *r = &q->ring; 235 + unsigned left = io_uring_sq_space_left(r); 236 + 237 + if (left < 3) 238 + io_uring_submit(r); 239 + 240 + *sqe1 = io_uring_get_sqe(r); 241 + *sqe2 = io_uring_get_sqe(r); 242 + *sqe3 = io_uring_get_sqe(r); 243 + } 244 + 245 + static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe, 246 + int dev_fd, int tag, int q_id, __u64 index) 247 + { 248 + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; 249 + 250 + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); 251 + sqe->opcode = IORING_OP_URING_CMD; 252 + sqe->flags |= IOSQE_FIXED_FILE; 253 + sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF; 254 + 255 + cmd->tag = tag; 256 + cmd->addr = index; 257 + cmd->q_id = q_id; 258 + } 259 + 260 + static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe, 261 + int dev_fd, int tag, int q_id, __u64 index) 262 + { 263 + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; 264 + 265 + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); 266 + sqe->opcode = IORING_OP_URING_CMD; 267 + sqe->flags |= IOSQE_FIXED_FILE; 268 + sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF; 269 + 270 + cmd->tag = tag; 271 + cmd->addr = index; 272 + cmd->q_id = q_id; 273 + } 274 + 275 + static inline void *ublk_get_sqe_cmd(const struct io_uring_sqe *sqe) 276 + { 277 + return (void *)&sqe->cmd; 278 + } 279 + 280 + static inline void ublk_set_io_res(struct ublk_queue *q, int 
tag, int res) 281 + { 282 + q->ios[tag].result = res; 283 + } 284 + 285 + static inline int ublk_get_io_res(const struct ublk_queue *q, unsigned tag) 286 + { 287 + return q->ios[tag].result; 288 + } 289 + 290 + static inline void ublk_mark_io_done(struct ublk_io *io, int res) 291 + { 292 + io->flags |= (UBLKSRV_NEED_COMMIT_RQ_COMP | UBLKSRV_IO_FREE); 293 + io->result = res; 294 + } 295 + 296 + static inline const struct ublksrv_io_desc *ublk_get_iod(const struct ublk_queue *q, int tag) 297 + { 298 + return (struct ublksrv_io_desc *)&(q->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); 299 + } 300 + 301 + static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) 302 + { 303 + __u32 *addr = (__u32 *)&sqe->off; 304 + 305 + addr[0] = cmd_op; 306 + addr[1] = 0; 307 + } 308 + 309 + static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res) 310 + { 311 + struct ublk_io *io = &q->ios[tag]; 312 + 313 + ublk_mark_io_done(io, res); 314 + 315 + return ublk_queue_io_cmd(q, io, tag); 316 + } 317 + 318 + static inline int ublk_queue_use_zc(const struct ublk_queue *q) 319 + { 320 + return q->state & UBLKSRV_ZC; 321 + } 322 + 323 + extern const struct ublk_tgt_ops null_tgt_ops; 324 + extern const struct ublk_tgt_ops loop_tgt_ops; 325 + 326 + #endif
+38
tools/testing/selftests/ublk/null.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #include "kublk.h" 4 + 5 + static int ublk_null_tgt_init(struct ublk_dev *dev) 6 + { 7 + const struct ublksrv_ctrl_dev_info *info = &dev->dev_info; 8 + unsigned long dev_size = 250UL << 30; 9 + 10 + dev->tgt.dev_size = dev_size; 11 + dev->tgt.params = (struct ublk_params) { 12 + .types = UBLK_PARAM_TYPE_BASIC, 13 + .basic = { 14 + .logical_bs_shift = 9, 15 + .physical_bs_shift = 12, 16 + .io_opt_shift = 12, 17 + .io_min_shift = 9, 18 + .max_sectors = info->max_io_buf_bytes >> 9, 19 + .dev_sectors = dev_size >> 9, 20 + }, 21 + }; 22 + 23 + return 0; 24 + } 25 + 26 + static int ublk_null_queue_io(struct ublk_queue *q, int tag) 27 + { 28 + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); 29 + 30 + ublk_complete_io(q, tag, iod->nr_sectors << 9); 31 + return 0; 32 + } 33 + 34 + const struct ublk_tgt_ops null_tgt_ops = { 35 + .name = "null", 36 + .init_tgt = ublk_null_tgt_init, 37 + .queue_io = ublk_null_queue_io, 38 + };
+113
tools/testing/selftests/ublk/test_common.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

# Helpers shared by the ublk selftests; this file is sourced by each test.

# Create a file of size $1 in the current directory and print its name.
_create_backfile() {
	local my_size=$1
	local my_file

	my_file=$(mktemp "ublk_bpf_${my_size}_XXXXX")
	truncate -s "${my_size}" "${my_file}"
	echo "$my_file"
}

# Remove the file $1 if it exists as a regular file.
_remove_backfile() {
	local file=$1

	[ -f "$file" ] && rm -f "$file"
}

# Create a temporary directory in the current directory and print its name.
_create_tmp_dir() {
	local my_file

	my_file=$(mktemp -d ublk_bpf_dir_XXXXX)
	echo "$my_file"
}

# Remove the (empty) directory $1 if it exists.
_remove_tmp_dir() {
	local dir=$1

	[ -d "$dir" ] && rmdir "$dir"
}

# Make an ext4 filesystem on block device $1, then mount and unmount it.
# Returns 0 on success, otherwise the status of the first step that failed.
_mkfs_mount_test()
{
	local dev=$1
	local err_code=0
	local mnt_dir

	mkfs.ext4 -F "$dev" > /dev/null 2>&1
	err_code=$?
	if [ $err_code -ne 0 ]; then
		return $err_code
	fi

	# The mount point is created only once it is needed, so that a mkfs
	# failure does not leave a stray directory behind.
	mnt_dir=$(_create_tmp_dir)
	mount -t ext4 "$dev" "$mnt_dir" > /dev/null 2>&1
	err_code=$?
	if [ $err_code -eq 0 ]; then
		umount "$dev"
		err_code=$?
	fi
	_remove_tmp_dir "$mnt_dir"

	return $err_code
}

# Exit with the kselftest skip code (4) unless running as root.
_check_root() {
	local ksft_skip=4

	if [ "$UID" != 0 ]; then
		echo please run this as root >&2
		exit $ksft_skip
	fi
}

# Delete all ublk devices.
_remove_ublk_devices() {
	"${UBLK_PROG}" del -a
}

# Print field 11 of the "state" line that "kublk list" shows for device $1.
_get_ublk_dev_state() {
	"${UBLK_PROG}" list -n "$1" | grep "state" | awk '{print $11}'
}

# Print field 7 of the "pid" line that "kublk list" shows for device $1.
_get_ublk_daemon_pid() {
	"${UBLK_PROG}" list -n "$1" | grep "pid" | awk '{print $7}'
}

# Check for root, then print a banner: $1 is the target type, the remaining
# arguments describe the test.
_prep_test() {
	_check_root
	local type=$1
	shift 1
	echo "ublk $type: $*"
}

# Report the outcome of test $1 whose status is $2. A non-zero status is
# also used as the exit status of the calling script, so that the selftest
# runner sees the failure instead of the status of the final echo.
_show_result()
{
	if [ "$2" -ne 0 ]; then
		echo "$1 : [FAIL]"
		exit "$2"
	fi
	echo "$1 : [PASS]"
}

# Delete ublk device $1.
_cleanup_test() {
	"${UBLK_PROG}" del -n "$1"
}

# Run "kublk add" with the given arguments and print the id of the new
# device. Callers capture stdout, so the failure message goes to stderr
# rather than ending up in the caller's device id.
_add_ublk_dev() {
	local kublk_temp
	local dev_id

	kublk_temp=$(mktemp /tmp/kublk-XXXXXX)
	if ! "${UBLK_PROG}" add "$@" > "${kublk_temp}" 2>&1; then
		echo "fail to add ublk dev $*" >&2
		rm -f "${kublk_temp}"
		# same status bash reports for "exit -1"
		exit 255
	fi

	dev_id=$(grep "dev id" "${kublk_temp}" | awk -F '[ :]' '{print $3}')
	udevadm settle
	rm -f "${kublk_temp}"
	echo "${dev_id}"
}

# Return 0 when "kublk features" mentions $1, 1 otherwise.
_have_feature()
{
	if "$UBLK_PROG" "features" | grep -- "$1" > /dev/null 2>&1; then
		return 0
	fi
	return 1
}

UBLK_PROG=$(pwd)/kublk
export UBLK_PROG
+31
tools/testing/selftests/ublk/test_loop_01.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

. test_common.sh

TID="loop_01"
ERR_CODE=0

_prep_test "loop" "write and verify test"

backfile_0=$(_create_backfile 256M)
dev_id=$(_add_ublk_dev -t loop $backfile_0)

# Write the whole ublk disk with direct IO, then have fio verify the
# crc32c of what was written.
fio_args=(
	--name=write_and_verify
	--filename=/dev/ublkb${dev_id}
	--ioengine=libaio
	--iodepth=16
	--rw=write
	--size=256M
	--direct=1
	--verify=crc32c
	--do_verify=1
	--bs=4k
)
fio "${fio_args[@]}" > /dev/null 2>&1
ERR_CODE=$?

_cleanup_test ${dev_id} "loop"
_remove_backfile $backfile_0

_show_result $TID $ERR_CODE
+22
tools/testing/selftests/ublk/test_loop_02.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

. test_common.sh

TID="loop_02"
ERR_CODE=0

_prep_test "loop" "mkfs & mount & umount"

backfile_0=$(_create_backfile 256M)
dev_id=$(_add_ublk_dev -t loop $backfile_0)

# Build a filesystem on the ublk disk and mount/unmount it once.
_mkfs_mount_test /dev/ublkb${dev_id}
ERR_CODE=$?

_cleanup_test ${dev_id} "loop"
_remove_backfile $backfile_0

_show_result $TID $ERR_CODE
+33
tools/testing/selftests/ublk/test_loop_03.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

. test_common.sh

TID="loop_03"
ERR_CODE=0

# Leave with status 4 when kublk does not list the ZERO_COPY feature.
if ! _have_feature "ZERO_COPY"; then
	exit 4
fi

_prep_test "loop" "write and verify over zero copy"

backfile_0=$(_create_backfile 256M)
dev_id=$(_add_ublk_dev -t loop $backfile_0 -z)

# Write the whole ublk disk with direct IO, then have fio verify the
# crc32c of what was written.
fio_args=(
	--name=write_and_verify
	--filename=/dev/ublkb${dev_id}
	--ioengine=libaio
	--iodepth=64
	--rw=write
	--size=256M
	--direct=1
	--verify=crc32c
	--do_verify=1
	--bs=4k
)
fio "${fio_args[@]}" > /dev/null 2>&1
ERR_CODE=$?

_cleanup_test ${dev_id} "loop"
_remove_backfile $backfile_0

_show_result $TID $ERR_CODE
+22
tools/testing/selftests/ublk/test_loop_04.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

. test_common.sh

TID="loop_04"
ERR_CODE=0

# The device is added with -z (zero copy): leave with the skip status 4,
# instead of reporting a failure, when kublk does not list that feature.
_have_feature "ZERO_COPY" || exit 4

_prep_test "loop" "mkfs & mount & umount with zero copy"

backfile_0=$(_create_backfile 256M)

dev_id=$(_add_ublk_dev -t loop -z "$backfile_0")

# Build a filesystem on the ublk disk and mount/unmount it once.
_mkfs_mount_test "/dev/ublkb${dev_id}"
ERR_CODE=$?

_cleanup_test "${dev_id}" "loop"

_remove_backfile "$backfile_0"

_show_result $TID $ERR_CODE
+19
tools/testing/selftests/ublk/test_null_01.sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

. test_common.sh

TID="null_01"
ERR_CODE=0

_prep_test "null" "basic IO test"

dev_id=$(_add_ublk_dev -t null)

# Run a mixed read/write fio job over the ublk disk.
fio_args=(
	--name=job1
	--filename=/dev/ublkb${dev_id}
	--ioengine=libaio
	--rw=readwrite
	--iodepth=32
	--size=256M
)
fio "${fio_args[@]}" > /dev/null 2>&1
ERR_CODE=$?

_cleanup_test ${dev_id} "null"

_show_result $TID $ERR_CODE