Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: add support for fixed wait regions

Generally applications have 1 or a few ways of waiting, yet they pass
in a struct io_uring_getevents_arg every time. This needs to get copied
and, in turn, the timeout value needs to get copied.

Rather than do this for every invocation, allow the application to
register a fixed set of wait regions that can simply be indexed when
asking the kernel to wait on events.

At ring setup time, the application can register a number of these wait
regions and initialize region/index 0 upfront:

struct io_uring_reg_wait *reg;

reg = io_uring_setup_reg_wait(ring, nr_regions, &ret);

/* set timeout and mark as set, sigmask/sigmask_sz as needed */
reg->ts.tv_sec = 0;
reg->ts.tv_nsec = 100000;
reg->flags = IORING_REG_WAIT_TS;

where nr_regions >= 1 && nr_regions <= PAGE_SIZE / sizeof(*reg). The
above initializes index 0, but 63 other regions can be initialized,
if needed. Now, instead of doing:

struct __kernel_timespec timeout = { .tv_nsec = 100000, };

io_uring_submit_and_wait_timeout(ring, &cqe, nr, &timeout, NULL);

to wait for events for each submit_and_wait, or just wait, operation, it
can just reference the above region at offset 0 and do:

io_uring_submit_and_wait_reg(ring, &cqe, nr, 0);

to achieve the same goal of waiting 100usec without needing to copy
both struct io_uring_getevents_arg (24b) and struct __kernel_timespec
(16b) for each invocation. Struct io_uring_reg_wait looks as follows:

struct io_uring_reg_wait {
struct __kernel_timespec ts;
__u32 min_wait_usec;
__u32 flags;
__u64 sigmask;
__u32 sigmask_sz;
__u32 pad[3];
__u64 pad2[2];
};

embedding the timeout itself in the region, rather than passing it as
a pointer as well. Note that the signal mask is still passed as a
pointer, both for compatibility reasons and because there don't
seem to be a lot of high frequency wait scenarios that involve setting
and resetting the signal mask for each wait.

The application is free to modify any region before a wait call, or it
can keep multiple regions with different settings to avoid needing to
modify the same one for wait calls. Up to a page size of regions is mapped
by default, allowing PAGE_SIZE / 64 available regions for use.

The registered region must fit within a page. On a 4kb page size system,
that allows for 64 wait regions if a full page is used, as the size of
struct io_uring_reg_wait is 64b. The region registered must be aligned
to io_uring_reg_wait in size. It's valid to register fewer than 64
entries.

In network performance testing with zero-copy, this reduced the time
spent waiting on the TX side from 3.12% to 0.3% and the RX side from 4.4%
to 0.3%.

Wait regions are fixed for the lifetime of the ring - once registered,
they are persistent until the ring is torn down. The regions support
minimum wait timeout as well as the regular waits.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+192 -12
+10
include/linux/io_uring_types.h
··· 327 327 atomic_t cq_wait_nr; 328 328 atomic_t cq_timeouts; 329 329 struct wait_queue_head cq_wait; 330 + 331 + /* 332 + * If registered with IORING_REGISTER_CQWAIT_REG, a single 333 + * page holds N entries, mapped in cq_wait_arg. cq_wait_index 334 + * is the maximum allowable index. 335 + */ 336 + struct io_uring_reg_wait *cq_wait_arg; 337 + unsigned char cq_wait_index; 330 338 } ____cacheline_aligned_in_smp; 331 339 332 340 /* timeouts */ ··· 438 430 unsigned short n_sqe_pages; 439 431 struct page **ring_pages; 440 432 struct page **sqe_pages; 433 + 434 + struct page **cq_wait_page; 441 435 }; 442 436 443 437 struct io_tw_state {
+41
include/uapi/linux/io_uring.h
··· 518 518 #define IORING_ENTER_EXT_ARG (1U << 3) 519 519 #define IORING_ENTER_REGISTERED_RING (1U << 4) 520 520 #define IORING_ENTER_ABS_TIMER (1U << 5) 521 + #define IORING_ENTER_EXT_ARG_REG (1U << 6) 521 522 522 523 /* 523 524 * Passed in for io_uring_setup(2). Copied back with updated info on success ··· 620 619 621 620 /* resize CQ ring */ 622 621 IORING_REGISTER_RESIZE_RINGS = 33, 622 + 623 + /* register fixed io_uring_reg_wait arguments */ 624 + IORING_REGISTER_CQWAIT_REG = 34, 623 625 624 626 /* this goes last */ 625 627 IORING_REGISTER_LAST, ··· 807 803 IORING_RESTRICTION_LAST 808 804 }; 809 805 806 + enum { 807 + IORING_REG_WAIT_TS = (1U << 0), 808 + }; 809 + 810 + /* 811 + * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of 812 + * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is 813 + * called rather than pass in a wait argument structure separately. 814 + */ 815 + struct io_uring_cqwait_reg_arg { 816 + __u32 flags; 817 + __u32 struct_size; 818 + __u32 nr_entries; 819 + __u32 pad; 820 + __u64 user_addr; 821 + __u64 pad2[3]; 822 + }; 823 + 824 + /* 825 + * Argument for io_uring_enter(2) with 826 + * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument 827 + * is an index into a previously registered fixed wait region described by 828 + * the below structure. 829 + */ 830 + struct io_uring_reg_wait { 831 + struct __kernel_timespec ts; 832 + __u32 min_wait_usec; 833 + __u32 flags; 834 + __u64 sigmask; 835 + __u32 sigmask_sz; 836 + __u32 pad[3]; 837 + __u64 pad2[2]; 838 + }; 839 + 840 + /* 841 + * Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG 842 + */ 810 843 struct io_uring_getevents_arg { 811 844 __u64 sigmask; 812 845 __u32 sigmask_sz;
+58 -12
io_uring/io_uring.c
··· 2736 2736 io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); 2737 2737 io_futex_cache_free(ctx); 2738 2738 io_destroy_buffers(ctx); 2739 + io_unregister_cqwait_reg(ctx); 2739 2740 mutex_unlock(&ctx->uring_lock); 2740 2741 if (ctx->sq_creds) 2741 2742 put_cred(ctx->sq_creds); ··· 3225 3224 io_uring_cancel_generic(cancel_all, NULL); 3226 3225 } 3227 3226 3228 - static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) 3227 + static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, 3228 + const struct io_uring_getevents_arg __user *uarg) 3229 3229 { 3230 - if (flags & IORING_ENTER_EXT_ARG) { 3231 - struct io_uring_getevents_arg arg; 3230 + struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg); 3232 3231 3233 - if (argsz != sizeof(arg)) 3234 - return -EINVAL; 3235 - if (copy_from_user(&arg, argp, sizeof(arg))) 3236 - return -EFAULT; 3232 + if (arg) { 3233 + unsigned int index = (unsigned int) (uintptr_t) uarg; 3234 + 3235 + if (index <= ctx->cq_wait_index) 3236 + return arg + index; 3237 3237 } 3238 + 3239 + return ERR_PTR(-EFAULT); 3240 + } 3241 + 3242 + static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags, 3243 + const void __user *argp, size_t argsz) 3244 + { 3245 + struct io_uring_getevents_arg arg; 3246 + 3247 + if (!(flags & IORING_ENTER_EXT_ARG)) 3248 + return 0; 3249 + 3250 + if (flags & IORING_ENTER_EXT_ARG_REG) { 3251 + if (argsz != sizeof(struct io_uring_reg_wait)) 3252 + return -EINVAL; 3253 + return PTR_ERR(io_get_ext_arg_reg(ctx, argp)); 3254 + } 3255 + if (argsz != sizeof(arg)) 3256 + return -EINVAL; 3257 + if (copy_from_user(&arg, argp, sizeof(arg))) 3258 + return -EFAULT; 3238 3259 return 0; 3239 3260 } 3240 3261 3241 - static int io_get_ext_arg(unsigned flags, const void __user *argp, 3242 - struct ext_arg *ext_arg) 3262 + static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags, 3263 + const void __user *argp, struct ext_arg *ext_arg) 3243 3264 { 3244 3265 
const struct io_uring_getevents_arg __user *uarg = argp; 3245 3266 struct io_uring_getevents_arg arg; ··· 3272 3249 */ 3273 3250 if (!(flags & IORING_ENTER_EXT_ARG)) { 3274 3251 ext_arg->sig = (const sigset_t __user *) argp; 3252 + return 0; 3253 + } 3254 + 3255 + if (flags & IORING_ENTER_EXT_ARG_REG) { 3256 + struct io_uring_reg_wait *w; 3257 + 3258 + if (ext_arg->argsz != sizeof(struct io_uring_reg_wait)) 3259 + return -EINVAL; 3260 + w = io_get_ext_arg_reg(ctx, argp); 3261 + if (IS_ERR(w)) 3262 + return PTR_ERR(w); 3263 + 3264 + if (w->flags & ~IORING_REG_WAIT_TS) 3265 + return -EINVAL; 3266 + ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC; 3267 + ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask)); 3268 + ext_arg->argsz = READ_ONCE(w->sigmask_sz); 3269 + if (w->flags & IORING_REG_WAIT_TS) { 3270 + ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec); 3271 + ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec); 3272 + ext_arg->ts_set = true; 3273 + } 3275 3274 return 0; 3276 3275 } 3277 3276 ··· 3342 3297 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 3343 3298 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | 3344 3299 IORING_ENTER_REGISTERED_RING | 3345 - IORING_ENTER_ABS_TIMER))) 3300 + IORING_ENTER_ABS_TIMER | 3301 + IORING_ENTER_EXT_ARG_REG))) 3346 3302 return -EINVAL; 3347 3303 3348 3304 /* ··· 3426 3380 */ 3427 3381 mutex_lock(&ctx->uring_lock); 3428 3382 iopoll_locked: 3429 - ret2 = io_validate_ext_arg(flags, argp, argsz); 3383 + ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); 3430 3384 if (likely(!ret2)) { 3431 3385 min_complete = min(min_complete, 3432 3386 ctx->cq_entries); ··· 3436 3390 } else { 3437 3391 struct ext_arg ext_arg = { .argsz = argsz }; 3438 3392 3439 - ret2 = io_get_ext_arg(flags, argp, &ext_arg); 3393 + ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg); 3440 3394 if (likely(!ret2)) { 3441 3395 min_complete = min(min_complete, 3442 3396 ctx->cq_entries);
+82
io_uring/register.c
··· 570 570 return ret; 571 571 } 572 572 573 + void io_unregister_cqwait_reg(struct io_ring_ctx *ctx) 574 + { 575 + unsigned short npages = 1; 576 + 577 + if (!ctx->cq_wait_page) 578 + return; 579 + 580 + io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true); 581 + ctx->cq_wait_arg = NULL; 582 + if (ctx->user) 583 + __io_unaccount_mem(ctx->user, 1); 584 + } 585 + 586 + /* 587 + * Register a page holding N entries of struct io_uring_reg_wait, which can 588 + * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set. 589 + * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing 590 + * in a pointer for a struct io_uring_getevents_arg, an index into this 591 + * registered array is passed, avoiding two (arg + timeout) copies per 592 + * invocation. 593 + */ 594 + static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg) 595 + { 596 + struct io_uring_cqwait_reg_arg arg; 597 + struct io_uring_reg_wait *reg; 598 + struct page **pages; 599 + unsigned long len; 600 + int nr_pages, poff; 601 + int ret; 602 + 603 + if (ctx->cq_wait_page || ctx->cq_wait_arg) 604 + return -EBUSY; 605 + if (copy_from_user(&arg, uarg, sizeof(arg))) 606 + return -EFAULT; 607 + if (!arg.nr_entries || arg.flags) 608 + return -EINVAL; 609 + if (arg.struct_size != sizeof(*reg)) 610 + return -EINVAL; 611 + if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len)) 612 + return -EOVERFLOW; 613 + if (len > PAGE_SIZE) 614 + return -EINVAL; 615 + /* offset + len must fit within a page, and must be reg_wait aligned */ 616 + poff = arg.user_addr & ~PAGE_MASK; 617 + if (len + poff > PAGE_SIZE) 618 + return -EINVAL; 619 + if (poff % arg.struct_size) 620 + return -EINVAL; 621 + 622 + pages = io_pin_pages(arg.user_addr, len, &nr_pages); 623 + if (IS_ERR(pages)) 624 + return PTR_ERR(pages); 625 + ret = -EINVAL; 626 + if (nr_pages != 1) 627 + goto out_free; 628 + if (ctx->user) { 629 + ret = __io_account_mem(ctx->user, 1); 630 + if (ret) 631 + 
goto out_free; 632 + } 633 + 634 + reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL); 635 + if (reg) { 636 + ctx->cq_wait_index = arg.nr_entries - 1; 637 + WRITE_ONCE(ctx->cq_wait_page, pages); 638 + WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff); 639 + return 0; 640 + } 641 + ret = -ENOMEM; 642 + if (ctx->user) 643 + __io_unaccount_mem(ctx->user, 1); 644 + out_free: 645 + io_pages_free(&pages, nr_pages); 646 + return ret; 647 + } 648 + 573 649 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, 574 650 void __user *arg, unsigned nr_args) 575 651 __releases(ctx->uring_lock) ··· 839 763 if (!arg || nr_args != 1) 840 764 break; 841 765 ret = io_register_resize_rings(ctx, arg); 766 + break; 767 + case IORING_REGISTER_CQWAIT_REG: 768 + ret = -EINVAL; 769 + if (!arg || nr_args != 1) 770 + break; 771 + ret = io_register_cqwait_reg(ctx, arg); 842 772 break; 843 773 default: 844 774 ret = -EINVAL;
+1
io_uring/register.h
··· 5 5 int io_eventfd_unregister(struct io_ring_ctx *ctx); 6 6 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); 7 7 struct file *io_uring_register_get_file(unsigned int fd, bool registered); 8 + void io_unregister_cqwait_reg(struct io_ring_ctx *ctx); 8 9 9 10 #endif