Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: add support for IORING_SETUP_SQE_MIXED

Normal rings support 64b SQEs for posting submissions, while certain
features require the ring to be configured with IORING_SETUP_SQE128, as
they need to convey more information per submission. This, in turn,
makes ALL the SQEs be 128b in size. This is somewhat wasteful and
inefficient, particularly when only certain SQEs need to be of the
bigger variant.

This adds support for setting up a ring with mixed SQE sizes, using
IORING_SETUP_SQE_MIXED. When set up in this mode, SQEs posted to the ring
may be either 64b or 128b in size. If an SQE is 128b in size, then its opcode
will be set to a 128b variant to indicate that this is the case. Any other,
non-128b opcode will assume the SQ's default size.

SQEs on these types of mixed rings may also utilize NOP with skip
success set. This can happen if the ring is one (small) SQE entry away
from wrapping, and an attempt is made to get a 128b SQE. As SQEs must be
contiguous in the SQ ring, a 128b SQE cannot wrap the ring. For this
case, a single NOP SQE should be inserted with the SKIP_SUCCESS flag
set. The kernel will process this as a normal NOP, but without posting a
CQE.

Signed-off-by: Keith Busch <kbusch@kernel.org>
[axboe: {} style fix and assign sqe before opcode read]
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Keith Busch and committed by
Jens Axboe
1cba30bf 5b6d8a03

+114 -26
+8
include/uapi/linux/io_uring.h
··· 231 231 */ 232 232 #define IORING_SETUP_CQE_MIXED (1U << 18) 233 233 234 + /* 235 + * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have 236 + * a 128b opcode. 237 + */ 238 + #define IORING_SETUP_SQE_MIXED (1U << 19) 239 + 234 240 enum io_uring_op { 235 241 IORING_OP_NOP, 236 242 IORING_OP_READV, ··· 301 295 IORING_OP_READV_FIXED, 302 296 IORING_OP_WRITEV_FIXED, 303 297 IORING_OP_PIPE, 298 + IORING_OP_NOP128, 299 + IORING_OP_URING_CMD128, 304 300 305 301 /* this goes last, obviously */ 306 302 IORING_OP_LAST,
+27 -7
io_uring/fdinfo.c
··· 14 14 #include "fdinfo.h" 15 15 #include "cancel.h" 16 16 #include "rsrc.h" 17 + #include "opdef.h" 17 18 18 19 #ifdef CONFIG_NET_RX_BUSY_POLL 19 20 static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, ··· 67 66 unsigned int cq_head = READ_ONCE(r->cq.head); 68 67 unsigned int cq_tail = READ_ONCE(r->cq.tail); 69 68 unsigned int sq_shift = 0; 70 - unsigned int sq_entries; 71 69 int sq_pid = -1, sq_cpu = -1; 72 70 u64 sq_total_time = 0, sq_work_time = 0; 73 71 unsigned int i; ··· 89 89 seq_printf(m, "CqTail:\t%u\n", cq_tail); 90 90 seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail)); 91 91 seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head); 92 - sq_entries = min(sq_tail - sq_head, ctx->sq_entries); 93 - for (i = 0; i < sq_entries; i++) { 94 - unsigned int entry = i + sq_head; 92 + while (sq_head < sq_tail) { 95 93 struct io_uring_sqe *sqe; 96 94 unsigned int sq_idx; 95 + bool sqe128 = false; 96 + u8 opcode; 97 97 98 98 if (ctx->flags & IORING_SETUP_NO_SQARRAY) 99 99 break; 100 - sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); 100 + sq_idx = READ_ONCE(ctx->sq_array[sq_head & sq_mask]); 101 101 if (sq_idx > sq_mask) 102 102 continue; 103 + 103 104 sqe = &ctx->sq_sqes[sq_idx << sq_shift]; 105 + opcode = READ_ONCE(sqe->opcode); 106 + if (sq_shift) { 107 + sqe128 = true; 108 + } else if (io_issue_defs[opcode].is_128) { 109 + if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) { 110 + seq_printf(m, 111 + "%5u: invalid sqe, 128B entry on non-mixed sq\n", 112 + sq_idx); 113 + break; 114 + } 115 + if ((++sq_head & sq_mask) == 0) { 116 + seq_printf(m, 117 + "%5u: corrupted sqe, wrapping 128B entry\n", 118 + sq_idx); 119 + break; 120 + } 121 + sqe128 = true; 122 + } 104 123 seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " 105 124 "addr:0x%llx, rw_flags:0x%x, buf_index:%d " 106 125 "user_data:%llu", 107 - sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd, 126 + sq_idx, io_uring_get_opcode(opcode), sqe->fd, 108 127 sqe->flags, (unsigned 
long long) sqe->off, 109 128 (unsigned long long) sqe->addr, sqe->rw_flags, 110 129 sqe->buf_index, sqe->user_data); 111 - if (sq_shift) { 130 + if (sqe128) { 112 131 u64 *sqeb = (void *) (sqe + 1); 113 132 int size = sizeof(struct io_uring_sqe) / sizeof(u64); 114 133 int j; ··· 139 120 } 140 121 } 141 122 seq_printf(m, "\n"); 123 + sq_head++; 142 124 } 143 125 seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); 144 126 while (cq_head < cq_tail) {
+33 -4
io_uring/io_uring.c
··· 2164 2164 } 2165 2165 2166 2166 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, 2167 - const struct io_uring_sqe *sqe) 2167 + const struct io_uring_sqe *sqe, unsigned int *left) 2168 2168 __must_hold(&ctx->uring_lock) 2169 2169 { 2170 2170 const struct io_issue_def *def; ··· 2190 2190 opcode = array_index_nospec(opcode, IORING_OP_LAST); 2191 2191 2192 2192 def = &io_issue_defs[opcode]; 2193 + if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) { 2194 + /* 2195 + * A 128b op on a non-128b SQ requires mixed SQE support as 2196 + * well as 2 contiguous entries. 2197 + */ 2198 + if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || 2199 + !(ctx->cached_sq_head & (ctx->sq_entries - 1))) 2200 + return io_init_fail_req(req, -EINVAL); 2201 + /* 2202 + * A 128b operation on a mixed SQ uses two entries, so we have 2203 + * to increment the head and cached refs, and decrement what's 2204 + * left. 2205 + */ 2206 + current->io_uring->cached_refs++; 2207 + ctx->cached_sq_head++; 2208 + (*left)--; 2209 + } 2210 + 2193 2211 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { 2194 2212 /* enforce forwards compatibility on users */ 2195 2213 if (sqe_flags & ~SQE_VALID_FLAGS) ··· 2317 2299 } 2318 2300 2319 2301 static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 2320 - const struct io_uring_sqe *sqe) 2302 + const struct io_uring_sqe *sqe, unsigned int *left) 2321 2303 __must_hold(&ctx->uring_lock) 2322 2304 { 2323 2305 struct io_submit_link *link = &ctx->submit_state.link; 2324 2306 int ret; 2325 2307 2326 - ret = io_init_req(ctx, req, sqe); 2308 + ret = io_init_req(ctx, req, sqe, left); 2327 2309 if (unlikely(ret)) 2328 2310 return io_submit_fail_init(sqe, req, ret); 2329 2311 ··· 2475 2457 * Continue submitting even for sqe failure if the 2476 2458 * ring was setup with IORING_SETUP_SUBMIT_ALL 2477 2459 */ 2478 - if (unlikely(io_submit_sqe(ctx, req, sqe)) && 2460 + if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) && 2479 
2461 !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { 2480 2462 left--; 2481 2463 break; ··· 2818 2800 } 2819 2801 if (flags & IORING_SETUP_CQE_MIXED) { 2820 2802 if (cq_entries < 2) 2803 + return SIZE_MAX; 2804 + } 2805 + if (flags & IORING_SETUP_SQE_MIXED) { 2806 + if (sq_entries < 2) 2821 2807 return SIZE_MAX; 2822 2808 } 2823 2809 ··· 3747 3725 */ 3748 3726 if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == 3749 3727 (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) 3728 + return -EINVAL; 3729 + /* 3730 + * Nonsensical to ask for SQE128 and mixed SQE support, it's not 3731 + * supported to post 64b SQEs on a ring setup with SQE128. 3732 + */ 3733 + if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) == 3734 + (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) 3750 3735 return -EINVAL; 3751 3736 3752 3737 return 0;
+2 -12
io_uring/io_uring.h
··· 54 54 IORING_SETUP_REGISTERED_FD_ONLY |\ 55 55 IORING_SETUP_NO_SQARRAY |\ 56 56 IORING_SETUP_HYBRID_IOPOLL |\ 57 - IORING_SETUP_CQE_MIXED) 57 + IORING_SETUP_CQE_MIXED |\ 58 + IORING_SETUP_SQE_MIXED) 58 59 59 60 #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ 60 61 IORING_ENTER_SQ_WAKEUP |\ ··· 564 563 io_req_set_res(req, res, 0); 565 564 req->io_task_work.func = io_req_task_complete; 566 565 io_req_task_work_add(req); 567 - } 568 - 569 - /* 570 - * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each 571 - * slot. 572 - */ 573 - static inline size_t uring_sqe_size(struct io_ring_ctx *ctx) 574 - { 575 - if (ctx->flags & IORING_SETUP_SQE128) 576 - return 2 * sizeof(struct io_uring_sqe); 577 - return sizeof(struct io_uring_sqe); 578 566 } 579 567 580 568 static inline bool io_file_can_poll(struct io_kiocb *req)
+26
io_uring/opdef.c
··· 575 575 .prep = io_pipe_prep, 576 576 .issue = io_pipe, 577 577 }, 578 + [IORING_OP_NOP128] = { 579 + .audit_skip = 1, 580 + .iopoll = 1, 581 + .is_128 = 1, 582 + .prep = io_nop_prep, 583 + .issue = io_nop, 584 + }, 585 + [IORING_OP_URING_CMD128] = { 586 + .buffer_select = 1, 587 + .needs_file = 1, 588 + .plug = 1, 589 + .iopoll = 1, 590 + .iopoll_queue = 1, 591 + .is_128 = 1, 592 + .async_size = sizeof(struct io_async_cmd), 593 + .prep = io_uring_cmd_prep, 594 + .issue = io_uring_cmd, 595 + }, 578 596 }; 579 597 580 598 const struct io_cold_def io_cold_defs[] = { ··· 842 824 }, 843 825 [IORING_OP_PIPE] = { 844 826 .name = "PIPE", 827 + }, 828 + [IORING_OP_NOP128] = { 829 + .name = "NOP128", 830 + }, 831 + [IORING_OP_URING_CMD128] = { 832 + .name = "URING_CMD128", 833 + .sqe_copy = io_uring_cmd_sqe_copy, 834 + .cleanup = io_uring_cmd_cleanup, 845 835 }, 846 836 }; 847 837
+2
io_uring/opdef.h
··· 27 27 unsigned iopoll_queue : 1; 28 28 /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ 29 29 unsigned vectored : 1; 30 + /* set to 1 if this opcode uses 128b sqes in a mixed sq */ 31 + unsigned is_128 : 1; 30 32 31 33 /* size of async data needed, if any */ 32 34 unsigned short async_size;
+1 -1
io_uring/register.c
··· 394 394 #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) 395 395 #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ 396 396 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ 397 - IORING_SETUP_CQE_MIXED) 397 + IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED) 398 398 399 399 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) 400 400 {
+15 -2
io_uring/uring_cmd.c
··· 216 216 return 0; 217 217 } 218 218 219 + /* 220 + * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each 221 + * slot. 222 + */ 223 + static inline size_t uring_sqe_size(struct io_kiocb *req) 224 + { 225 + if (req->ctx->flags & IORING_SETUP_SQE128 || 226 + req->opcode == IORING_OP_URING_CMD128) 227 + return 2 * sizeof(struct io_uring_sqe); 228 + return sizeof(struct io_uring_sqe); 229 + } 230 + 219 231 void io_uring_cmd_sqe_copy(struct io_kiocb *req) 220 232 { 221 233 struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); ··· 236 224 /* Should not happen, as REQ_F_SQE_COPIED covers this */ 237 225 if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes)) 238 226 return; 239 - memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); 227 + memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req)); 240 228 ioucmd->sqe = ac->sqes; 241 229 } 242 230 ··· 254 242 if (ret) 255 243 return ret; 256 244 257 - if (ctx->flags & IORING_SETUP_SQE128) 245 + if (ctx->flags & IORING_SETUP_SQE128 || 246 + req->opcode == IORING_OP_URING_CMD128) 258 247 issue_flags |= IO_URING_F_SQE128; 259 248 if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED)) 260 249 issue_flags |= IO_URING_F_CQE32;