Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

io_uring: add support for IORING_SETUP_CQE_MIXED

Normal rings support 16b CQEs for posting completions, while certain
features require the ring to be configured with IORING_SETUP_CQE32, as
they need to convey more information per completion. This, in turn,
forces ALL CQEs on the ring to be 32b in size, which is wasteful and
inefficient when only certain CQEs need to be of the bigger variant.

This adds support for setting up a ring with mixed CQE sizes, using
IORING_SETUP_CQE_MIXED. When set up in this mode, CQEs posted to the
ring may be either 16b or 32b in size. If a CQE is 32b in size, then
IORING_CQE_F_32 is set in the CQE flags to indicate that this is the
case. If this flag isn't set, the CQE is the normal 16b variant.
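
For illustration, here is a minimal userspace sketch of creating such a
ring with the raw io_uring_setup(2) syscall. setup_mixed_ring() is a
hypothetical helper, not an existing API:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical helper: create a ring that may post both 16b and 32b CQEs. */
static int setup_mixed_ring(unsigned int entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	/* CQEs default to 16b; 32b ones carry IORING_CQE_F_32 in cqe->flags */
	p.flags = IORING_SETUP_CQE_MIXED;

	/* note: combining this with IORING_SETUP_CQE32 fails with -EINVAL */
	return syscall(__NR_io_uring_setup, entries, &p);
}

Per the rings_size() check in the diff below, a mixed ring also needs at
least two CQ entries, since a single 32b CQE spans two slots.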

CQEs on these mixed rings may also have IORING_CQE_F_SKIP set. This can
happen if the ring is one (small) CQE entry away from wrapping, and an
attempt is made to post a 32b CQE. As CQEs must be contiguous in the CQ
ring, a 32b CQE cannot wrap the ring. For this case, a single dummy CQE
is posted with the SKIP flag set, and the application should simply
ignore it.
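
On the reaping side, this means the consumer advances the CQ head by one
or two 16b slots per completion, depending on IORING_CQE_F_32, and skips
the dummies. A minimal sketch of such a loop, assuming the CQ ring has
already been mmap'ed the usual way (cqes, khead, ktail, mask and
handle_cqe() are hypothetical names, and the acquire/release barriers
real code needs are only noted in comments):

#include <linux/io_uring.h>

extern void handle_cqe(struct io_uring_cqe *cqe);	/* application-defined */

static void reap_mixed_cqes(struct io_uring_cqe *cqes, unsigned int *khead,
			    unsigned int *ktail, unsigned int mask)
{
	unsigned int head = *khead;	/* real code: load-acquire on *ktail */

	while (head != *ktail) {
		struct io_uring_cqe *cqe = &cqes[head & mask];

		if (cqe->flags & IORING_CQE_F_SKIP) {
			/* dummy filler posted before a wrapping 32b CQE */
			head++;
			continue;
		}
		handle_cqe(cqe);
		/* a 32b CQE occupies two contiguous 16b slots */
		head += (cqe->flags & IORING_CQE_F_32) ? 2 : 1;
	}
	*khead = head;	/* real code: store-release */
}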

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+105 -31
+6
include/uapi/linux/io_uring.h
···
 /* Use hybrid poll in iopoll process */
 #define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)

+/*
+ * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have
+ * IORING_CQE_F_32 set in cqe->flags.
+ */
+#define IORING_SETUP_CQE_MIXED	(1U << 18)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
+62 -16
io_uring/io_uring.c
···
 static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
 {
-	size_t cqe_size = sizeof(struct io_uring_cqe);
-
 	lockdep_assert_held(&ctx->uring_lock);

 	/* don't abort if we're dying, entries must get freed */
 	if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
 		return;

-	if (ctx->flags & IORING_SETUP_CQE32)
-		cqe_size <<= 1;
-
 	io_cq_lock(ctx);
 	while (!list_empty(&ctx->cq_overflow_list)) {
+		size_t cqe_size = sizeof(struct io_uring_cqe);
 		struct io_uring_cqe *cqe;
 		struct io_overflow_cqe *ocqe;
+		bool is_cqe32 = false;

 		ocqe = list_first_entry(&ctx->cq_overflow_list,
 					struct io_overflow_cqe, list);
+		if (ocqe->cqe.flags & IORING_CQE_F_32 ||
+		    ctx->flags & IORING_SETUP_CQE32) {
+			is_cqe32 = true;
+			cqe_size <<= 1;
+		}

 		if (!dying) {
-			if (!io_get_cqe_overflow(ctx, &cqe, true))
+			if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32))
 				break;
 			memcpy(cqe, &ocqe->cqe, cqe_size);
 		}
···
 {
 	struct io_overflow_cqe *ocqe;
 	size_t ocq_size = sizeof(struct io_overflow_cqe);
-	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+	bool is_cqe32 = false;

-	if (is_cqe32)
-		ocq_size += sizeof(struct io_uring_cqe);
+	if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) {
+		is_cqe32 = true;
+		ocq_size <<= 1;
+	}

 	ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
 	trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
···
 }

 /*
+ * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE
+ * because the ring is a single 16b entry away from wrapping.
+ */
+static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
+{
+	if (__io_cqring_events(ctx) < ctx->cq_entries) {
+		struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
+
+		cqe->user_data = 0;
+		cqe->res = 0;
+		cqe->flags = IORING_CQE_F_SKIP;
+		ctx->cached_cq_tail++;
+		return true;
+	}
+	return false;
+}
+
+/*
  * writes to the cq entry need to come after reading head; the
  * control dependency is enough as we're using WRITE_ONCE to
  * fill the cq entry
  */
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
···
 	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
 		return false;

+	/*
+	 * Post dummy CQE if a 32b CQE is needed and there's only room for a
+	 * 16b CQE before the ring wraps.
+	 */
+	if (cqe32 && off + 1 == ctx->cq_entries) {
+		if (!io_fill_nop_cqe(ctx, off))
+			return false;
+		off = 0;
+	}
+
 	/* userspace may cheat modifying the tail, be safe and do min */
 	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
 	free = ctx->cq_entries - queued;
 	/* we need a contiguous range, limit based on the current array offset */
 	len = min(free, ctx->cq_entries - off);
-	if (!len)
+	if (len < (cqe32 + 1))
 		return false;

 	if (ctx->flags & IORING_SETUP_CQE32) {
···
 {
 	struct io_uring_cqe *cqe;

-	if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+	if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))))
 		return false;
-	if (unlikely(!io_get_cqe(ctx, &cqe)))
+	if (unlikely(!io_get_cqe(ctx, &cqe, true)))
 		return false;

 	memcpy(cqe, src_cqe, 2 * sizeof(*cqe));
···
 static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 			    u32 cflags)
 {
+	bool cqe32 = cflags & IORING_CQE_F_32;
 	struct io_uring_cqe *cqe;

-	if (likely(io_get_cqe(ctx, &cqe))) {
+	if (likely(io_get_cqe(ctx, &cqe, cqe32))) {
 		WRITE_ONCE(cqe->user_data, user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, cflags);

-		if (ctx->flags & IORING_SETUP_CQE32) {
+		if (cqe32) {
 			WRITE_ONCE(cqe->big_cqe[0], 0);
 			WRITE_ONCE(cqe->big_cqe[1], 0);
 		}
···
 		if (check_shl_overflow(off, 1, &off))
 			return SIZE_MAX;
 	}
+	if (flags & IORING_SETUP_CQE_MIXED) {
+		if (cq_entries < 2)
+			return SIZE_MAX;
+	}

 #ifdef CONFIG_SMP
 	off = ALIGN(off, SMP_CACHE_BYTES);
···
 	    !(flags & IORING_SETUP_SINGLE_ISSUER))
 		return -EINVAL;

+	/*
+	 * Nonsensical to ask for CQE32 and mixed CQE support, it's not
+	 * supported to post 16b CQEs on a ring setup with CQE32.
+	 */
+	if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
+	    (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
+		return -EINVAL;
+
 	return 0;
 }
···
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
 			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL |
+			IORING_SETUP_CQE_MIXED))
 		return -EINVAL;

 	return io_uring_create(entries, &p, params);
+35 -14
io_uring/io_uring.h
···
 unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
			  unsigned int cq_entries, size_t *sq_offset);
 int io_uring_fill_params(unsigned entries, struct io_uring_params *p);
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32);
 int io_run_task_work_sig(struct io_ring_ctx *ctx);
 void io_req_defer_failed(struct io_kiocb *req, s32 res);
 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
···
 static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
 				       struct io_uring_cqe **ret,
-				       bool overflow)
+				       bool overflow, bool cqe32)
 {
 	io_lockdep_assert_cq_locked(ctx);

-	if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
-		if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
+	if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) {
+		if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32)))
 			return false;
 	}
 	*ret = ctx->cqe_cached;
 	ctx->cached_cq_tail++;
 	ctx->cqe_cached++;
-	if (ctx->flags & IORING_SETUP_CQE32)
+	if (ctx->flags & IORING_SETUP_CQE32) {
 		ctx->cqe_cached++;
+	} else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) {
+		ctx->cqe_cached++;
+		ctx->cached_cq_tail++;
+	}
+	WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel);
 	return true;
 }

-static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
+static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret,
+			      bool cqe32)
 {
-	return io_get_cqe_overflow(ctx, ret, false);
+	return io_get_cqe_overflow(ctx, ret, false, cqe32);
 }

 static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
···
 	io_lockdep_assert_cq_locked(ctx);

 	ctx->submit_state.cq_flush = true;
-	return io_get_cqe(ctx, cqe_ret);
+	return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED);
 }

 static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
 					    struct io_kiocb *req)
 {
+	bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32;
 	struct io_uring_cqe *cqe;

 	/*
-	 * If we can't get a cq entry, userspace overflowed the
-	 * submission (by quite a lot). Increment the overflow count in
-	 * the ring.
+	 * If we can't get a cq entry, userspace overflowed the submission
+	 * (by quite a lot).
 	 */
-	if (unlikely(!io_get_cqe(ctx, &cqe)))
+	if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32)))
 		return false;

-
 	memcpy(cqe, &req->cqe, sizeof(*cqe));
-	if (ctx->flags & IORING_SETUP_CQE32) {
+	if (is_cqe32) {
 		memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
 		memset(&req->big_cqe, 0, sizeof(req->big_cqe));
 	}
···
 {
 	req->cqe.res = res;
 	req->cqe.flags = cflags;
+}
+
+static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx)
+{
+	if (ctx->flags & IORING_SETUP_CQE_MIXED)
+		return IORING_CQE_F_32;
+	return 0;
+}
+
+static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags,
+				    __u64 extra1, __u64 extra2)
+{
+	req->cqe.res = res;
+	req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx);
+	req->big_cqe.extra1 = extra1;
+	req->big_cqe.extra2 = extra2;
 }

 static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
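
Nothing in this patch converts an opcode to the new helper yet, but a
hypothetical kernel-side caller might look like the sketch below.
io_foo_complete() and its arguments are illustrative only; the helper's
signature is taken from the diff above:

/*
 * Hypothetical caller of io_req_set_res32(). On a CQE_MIXED ring,
 * ctx_cqe32_flags() ORs in IORING_CQE_F_32, so the generic completion
 * path (io_fill_cqe_req()) later copies req->big_cqe into the CQ ring
 * and consumes two 16b slots for this request.
 */
static void io_foo_complete(struct io_kiocb *req, s32 res,
			    u64 extra1, u64 extra2)
{
	io_req_set_res32(req, res, 0, extra1, extra2);
}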
+2 -1
io_uring/register.c
···
 #define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
 #define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
-			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
+			 IORING_SETUP_CQE_MIXED)

 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 {