Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

io_uring: add support for IORING_SETUP_CQE_MIXED

Normal rings support 16b CQEs for posting completions, while certain
features require the ring to be configured with IORING_SETUP_CQE32, as
they need to convey more information per completion. This, in turn,
forces ALL CQEs on the ring to be 32b in size, which is wasteful and
inefficient when only certain CQEs need to be of the bigger variant.

This adds support for setting up a ring with mixed CQE sizes, using
IORING_SETUP_CQE_MIXED. When set up in this mode, CQEs posted to the
ring may be either 16b or 32b in size. If a CQE is 32b in size, then
IORING_CQE_F_32 is set in the CQE flags to indicate that this is the
case. If this flag isn't set, the CQE is the normal 16b variant.
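
For illustration, here is a minimal userspace sketch of creating such a
ring with the raw io_uring_setup(2) syscall. setup_mixed_ring() is a
hypothetical helper, not an existing API:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical helper: create a ring that may post both 16b and 32b CQEs. */
static int setup_mixed_ring(unsigned int entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	/* CQEs default to 16b; 32b ones carry IORING_CQE_F_32 in cqe->flags */
	p.flags = IORING_SETUP_CQE_MIXED;

	/* note: combining this with IORING_SETUP_CQE32 fails with -EINVAL */
	return syscall(__NR_io_uring_setup, entries, &p);
}

Per the rings_size() check in the diff below, a mixed ring also needs at
least two CQ entries, since a single 32b CQE spans two slots.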

CQEs on these mixed rings may also have IORING_CQE_F_SKIP set. This can
happen if the ring is one (small) CQE entry away from wrapping, and an
attempt is made to post a 32b CQE. As CQEs must be contiguous in the CQ
ring, a 32b CQE cannot wrap the ring. For this case, a single dummy CQE
is posted with the SKIP flag set, and the application should simply
ignore it.
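
On the reaping side, this means the consumer advances the CQ head by one
or two 16b slots per completion, depending on IORING_CQE_F_32, and skips
the dummies. A minimal sketch of such a loop, assuming the CQ ring has
already been mmap'ed the usual way (cqes, khead, ktail, mask and
handle_cqe() are hypothetical names, and the acquire/release barriers
real code needs are only noted in comments):

#include <linux/io_uring.h>

extern void handle_cqe(struct io_uring_cqe *cqe);	/* application-defined */

static void reap_mixed_cqes(struct io_uring_cqe *cqes, unsigned int *khead,
			    unsigned int *ktail, unsigned int mask)
{
	unsigned int head = *khead;	/* real code: load-acquire on *ktail */

	while (head != *ktail) {
		struct io_uring_cqe *cqe = &cqes[head & mask];

		if (cqe->flags & IORING_CQE_F_SKIP) {
			/* dummy filler posted before a wrapping 32b CQE */
			head++;
			continue;
		}
		handle_cqe(cqe);
		/* a 32b CQE occupies two contiguous 16b slots */
		head += (cqe->flags & IORING_CQE_F_32) ? 2 : 1;
	}
	*khead = head;	/* real code: store-release */
}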

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+105 -31
+6
include/uapi/linux/io_uring.h
···
 /* Use hybrid poll in iopoll process */
 #define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)

+/*
+ * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have
+ * IORING_CQE_F_32 set in cqe->flags.
+ */
+#define IORING_SETUP_CQE_MIXED	(1U << 18)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
+62 -16
io_uring/io_uring.c
···
 static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
 {
-	size_t cqe_size = sizeof(struct io_uring_cqe);
-
 	lockdep_assert_held(&ctx->uring_lock);

 	/* don't abort if we're dying, entries must get freed */
 	if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
 		return;

-	if (ctx->flags & IORING_SETUP_CQE32)
-		cqe_size <<= 1;
-
 	io_cq_lock(ctx);
 	while (!list_empty(&ctx->cq_overflow_list)) {
+		size_t cqe_size = sizeof(struct io_uring_cqe);
 		struct io_uring_cqe *cqe;
 		struct io_overflow_cqe *ocqe;
+		bool is_cqe32 = false;

 		ocqe = list_first_entry(&ctx->cq_overflow_list,
 					struct io_overflow_cqe, list);
+		if (ocqe->cqe.flags & IORING_CQE_F_32 ||
+		    ctx->flags & IORING_SETUP_CQE32) {
+			is_cqe32 = true;
+			cqe_size <<= 1;
+		}

 		if (!dying) {
-			if (!io_get_cqe_overflow(ctx, &cqe, true))
+			if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32))
 				break;
 			memcpy(cqe, &ocqe->cqe, cqe_size);
 		}
···
 {
 	struct io_overflow_cqe *ocqe;
 	size_t ocq_size = sizeof(struct io_overflow_cqe);
-	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+	bool is_cqe32 = false;

-	if (is_cqe32)
-		ocq_size += sizeof(struct io_uring_cqe);
+	if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) {
+		is_cqe32 = true;
+		ocq_size <<= 1;
+	}

 	ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
 	trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
···
 }

 /*
+ * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE
+ * because the ring is a single 16b entry away from wrapping.
+ */
+static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
+{
+	if (__io_cqring_events(ctx) < ctx->cq_entries) {
+		struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
+
+		cqe->user_data = 0;
+		cqe->res = 0;
+		cqe->flags = IORING_CQE_F_SKIP;
+		ctx->cached_cq_tail++;
+		return true;
+	}
+	return false;
+}
+
+/*
  * writes to the cq entry need to come after reading head; the
  * control dependency is enough as we're using WRITE_ONCE to
  * fill the cq entry
  */
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
···
 	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
 		return false;

+	/*
+	 * Post dummy CQE if a 32b CQE is needed and there's only room for a
+	 * 16b CQE before the ring wraps.
+	 */
+	if (cqe32 && off + 1 == ctx->cq_entries) {
+		if (!io_fill_nop_cqe(ctx, off))
+			return false;
+		off = 0;
+	}
+
 	/* userspace may cheat modifying the tail, be safe and do min */
 	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
 	free = ctx->cq_entries - queued;
 	/* we need a contiguous range, limit based on the current array offset */
 	len = min(free, ctx->cq_entries - off);
-	if (!len)
+	if (len < (cqe32 + 1))
 		return false;

 	if (ctx->flags & IORING_SETUP_CQE32) {
···
 {
 	struct io_uring_cqe *cqe;

-	if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+	if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))))
 		return false;
-	if (unlikely(!io_get_cqe(ctx, &cqe)))
+	if (unlikely(!io_get_cqe(ctx, &cqe, true)))
 		return false;

 	memcpy(cqe, src_cqe, 2 * sizeof(*cqe));
···
 static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 			    u32 cflags)
 {
+	bool cqe32 = cflags & IORING_CQE_F_32;
 	struct io_uring_cqe *cqe;

-	if (likely(io_get_cqe(ctx, &cqe))) {
+	if (likely(io_get_cqe(ctx, &cqe, cqe32))) {
 		WRITE_ONCE(cqe->user_data, user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, cflags);

-		if (ctx->flags & IORING_SETUP_CQE32) {
+		if (cqe32) {
 			WRITE_ONCE(cqe->big_cqe[0], 0);
 			WRITE_ONCE(cqe->big_cqe[1], 0);
 		}
···
 		if (check_shl_overflow(off, 1, &off))
 			return SIZE_MAX;
 	}
+	if (flags & IORING_SETUP_CQE_MIXED) {
+		if (cq_entries < 2)
+			return SIZE_MAX;
+	}

 #ifdef CONFIG_SMP
 	off = ALIGN(off, SMP_CACHE_BYTES);
···
 	    !(flags & IORING_SETUP_SINGLE_ISSUER))
 		return -EINVAL;

+	/*
+	 * Nonsensical to ask for CQE32 and mixed CQE support, it's not
+	 * supported to post 16b CQEs on a ring setup with CQE32.
+	 */
+	if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
+	    (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
+		return -EINVAL;
+
 	return 0;
 }
···
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
 			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL |
+			IORING_SETUP_CQE_MIXED))
 		return -EINVAL;

 	return io_uring_create(entries, &p, params);
+35 -14
io_uring/io_uring.h
···
 unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
			  unsigned int cq_entries, size_t *sq_offset);
 int io_uring_fill_params(unsigned entries, struct io_uring_params *p);
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32);
 int io_run_task_work_sig(struct io_ring_ctx *ctx);
 void io_req_defer_failed(struct io_kiocb *req, s32 res);
 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
···
 static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
 				       struct io_uring_cqe **ret,
-				       bool overflow)
+				       bool overflow, bool cqe32)
 {
 	io_lockdep_assert_cq_locked(ctx);

-	if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
-		if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
+	if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) {
+		if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32)))
 			return false;
 	}
 	*ret = ctx->cqe_cached;
 	ctx->cached_cq_tail++;
 	ctx->cqe_cached++;
-	if (ctx->flags & IORING_SETUP_CQE32)
+	if (ctx->flags & IORING_SETUP_CQE32) {
 		ctx->cqe_cached++;
+	} else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) {
+		ctx->cqe_cached++;
+		ctx->cached_cq_tail++;
+	}
+	WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel);
 	return true;
 }

-static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
+static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret,
+			      bool cqe32)
 {
-	return io_get_cqe_overflow(ctx, ret, false);
+	return io_get_cqe_overflow(ctx, ret, false, cqe32);
 }

 static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
···
 	io_lockdep_assert_cq_locked(ctx);

 	ctx->submit_state.cq_flush = true;
-	return io_get_cqe(ctx, cqe_ret);
+	return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED);
 }

 static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
 					    struct io_kiocb *req)
 {
+	bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32;
 	struct io_uring_cqe *cqe;

 	/*
-	 * If we can't get a cq entry, userspace overflowed the
-	 * submission (by quite a lot). Increment the overflow count in
-	 * the ring.
+	 * If we can't get a cq entry, userspace overflowed the submission
+	 * (by quite a lot).
 	 */
-	if (unlikely(!io_get_cqe(ctx, &cqe)))
+	if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32)))
 		return false;

-
 	memcpy(cqe, &req->cqe, sizeof(*cqe));
-	if (ctx->flags & IORING_SETUP_CQE32) {
+	if (is_cqe32) {
 		memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
 		memset(&req->big_cqe, 0, sizeof(req->big_cqe));
 	}
···
 {
 	req->cqe.res = res;
 	req->cqe.flags = cflags;
+}
+
+static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx)
+{
+	if (ctx->flags & IORING_SETUP_CQE_MIXED)
+		return IORING_CQE_F_32;
+	return 0;
+}
+
+static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags,
+				    __u64 extra1, __u64 extra2)
+{
+	req->cqe.res = res;
+	req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx);
+	req->big_cqe.extra1 = extra1;
+	req->big_cqe.extra2 = extra2;
 }

 static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
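
Nothing in this patch converts an opcode to the new helper yet, but a
hypothetical kernel-side caller might look like the sketch below.
io_foo_complete() and its arguments are illustrative only; the helper's
signature is taken from the diff above:

/*
 * Hypothetical caller of io_req_set_res32(). On a CQE_MIXED ring,
 * ctx_cqe32_flags() ORs in IORING_CQE_F_32, so the generic completion
 * path (io_fill_cqe_req()) later copies req->big_cqe into the CQ ring
 * and consumes two 16b slots for this request.
 */
static void io_foo_complete(struct io_kiocb *req, s32 res,
			    u64 extra1, u64 extra2)
{
	io_req_set_res32(req, res, 0, extra1, extra2);
}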
+2 -1
io_uring/register.c
···
 #define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
 #define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
-			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
+			 IORING_SETUP_CQE_MIXED)

 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 {