Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: introduce io_uring querying

There are many parameters users might want to query about io_uring like
available request types or the ring sizes. This patch introduces an
interface for such slow path queries.

It was written with several requirements in mind:
- Can be used with or without an io_uring instance. Asking for supported
setup flags before creating an instance as well as querying info about
an already created ring are valid use cases.
- Should be moderately fast. For example, users might use it to
periodically retrieve ring attributes at runtime. As a consequence,
it should be able to query multiple attributes in a single syscall.
- Backward and forward compatible.
- Should be reasonably easy to use.
- Reduce the kernel code size for introducing new query types.

It's implemented as a new registration opcode IORING_REGISTER_QUERY.
The user passes one or more query structures linked together, each
represented by struct io_uring_query_hdr. The header stores common
control fields needed for processing and points to query type specific
information.

The header contains
- The query type
- The result field, which on return contains the error code for the query
- Pointer to the query type specific information
- The size of the query structure. The kernel will only populate up to
the size, which helps with backward compatibility. The kernel can also
reduce the size, so if the current kernel is older than the interface
the user tries to use, it'll get only the supported bits.
- next_entry field is used to chain multiple queries.

Apart from common registration syscall failures, it can only immediately
return an error code in cases where the headers are incorrect or any
other addresses are invalid. That usually means that the userspace
doesn't use the API right and should be corrected. All query type
specific errors are returned in the header's result field.

As an example, the patch adds a single query type for now, i.e.
IO_URING_QUERY_OPCODES, which tells what register / request / etc.
opcodes are supported, but there are particular plans to extend it.

Note: there is a request probing interface via IORING_REGISTER_PROBE,
but it's a mess. It requires the user to create a ring first, it only
works for requests, and requires dynamic allocations.

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Pavel Begunkov and committed by
Jens Axboe
c265ae75 63805d0a

+153 -1
+3
include/uapi/linux/io_uring.h
··· 686 686 687 687 IORING_REGISTER_MEM_REGION = 34, 688 688 689 + /* query various aspects of io_uring, see linux/io_uring/query.h */ 690 + IORING_REGISTER_QUERY = 35, 691 + 689 692 /* this goes last */ 690 693 IORING_REGISTER_LAST, 691 694
+41
include/uapi/linux/io_uring/query.h
··· 1 + /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ 2 + /* 3 + * Header file for the io_uring query interface. 4 + */ 5 + #ifndef LINUX_IO_URING_QUERY_H 6 + #define LINUX_IO_URING_QUERY_H 7 + 8 + #include <linux/types.h> 9 + 10 + struct io_uring_query_hdr { 11 + __u64 next_entry; 12 + __u64 query_data; 13 + __u32 query_op; 14 + __u32 size; 15 + __s32 result; 16 + __u32 __resv[3]; 17 + }; 18 + 19 + enum { 20 + IO_URING_QUERY_OPCODES = 0, 21 + 22 + __IO_URING_QUERY_MAX, 23 + }; 24 + 25 + /* Doesn't require a ring */ 26 + struct io_uring_query_opcode { 27 + /* The number of supported IORING_OP_* opcodes */ 28 + __u32 nr_request_opcodes; 29 + /* The number of supported IORING_[UN]REGISTER_* opcodes */ 30 + __u32 nr_register_opcodes; 31 + /* Bitmask of all supported IORING_FEAT_* flags */ 32 + __u64 feature_flags; 33 + /* Bitmask of all supported IORING_SETUP_* flags */ 34 + __u64 ring_setup_flags; 35 + /* Bitmask of all supported IORING_ENTER_** flags */ 36 + __u64 enter_flags; 37 + /* Bitmask of all supported IOSQE_* flags */ 38 + __u64 sqe_flags; 39 + }; 40 + 41 + #endif
+1 -1
io_uring/Makefile
··· 13 13 sync.o msg_ring.o advise.o openclose.o \ 14 14 statx.o timeout.o cancel.o \ 15 15 waitid.o register.o truncate.o \ 16 - memmap.o alloc_cache.o 16 + memmap.o alloc_cache.o query.o 17 17 obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o 18 18 obj-$(CONFIG_IO_WQ) += io-wq.o 19 19 obj-$(CONFIG_FUTEX) += futex.o
+93
io_uring/query.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "linux/io_uring/query.h" 4 + 5 + #include "query.h" 6 + #include "io_uring.h" 7 + 8 + #define IO_MAX_QUERY_SIZE (sizeof(struct io_uring_query_opcode)) 9 + 10 + static ssize_t io_query_ops(void *data) 11 + { 12 + struct io_uring_query_opcode *e = data; 13 + 14 + BUILD_BUG_ON(sizeof(*e) > IO_MAX_QUERY_SIZE); 15 + 16 + e->nr_request_opcodes = IORING_OP_LAST; 17 + e->nr_register_opcodes = IORING_REGISTER_LAST; 18 + e->feature_flags = IORING_FEAT_FLAGS; 19 + e->ring_setup_flags = IORING_SETUP_FLAGS; 20 + e->enter_flags = IORING_ENTER_FLAGS; 21 + e->sqe_flags = SQE_VALID_FLAGS; 22 + return sizeof(*e); 23 + } 24 + 25 + static int io_handle_query_entry(struct io_ring_ctx *ctx, 26 + void *data, void __user *uhdr, 27 + u64 *next_entry) 28 + { 29 + struct io_uring_query_hdr hdr; 30 + size_t usize, res_size = 0; 31 + ssize_t ret = -EINVAL; 32 + void __user *udata; 33 + 34 + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) 35 + return -EFAULT; 36 + usize = hdr.size; 37 + hdr.size = min(hdr.size, IO_MAX_QUERY_SIZE); 38 + udata = u64_to_user_ptr(hdr.query_data); 39 + 40 + if (hdr.query_op >= __IO_URING_QUERY_MAX) { 41 + ret = -EOPNOTSUPP; 42 + goto out; 43 + } 44 + if (!mem_is_zero(hdr.__resv, sizeof(hdr.__resv)) || hdr.result || !hdr.size) 45 + goto out; 46 + if (copy_from_user(data, udata, hdr.size)) 47 + return -EFAULT; 48 + 49 + switch (hdr.query_op) { 50 + case IO_URING_QUERY_OPCODES: 51 + ret = io_query_ops(data); 52 + break; 53 + } 54 + 55 + if (ret >= 0) { 56 + if (WARN_ON_ONCE(ret > IO_MAX_QUERY_SIZE)) 57 + return -EFAULT; 58 + res_size = ret; 59 + ret = 0; 60 + } 61 + out: 62 + hdr.result = ret; 63 + hdr.size = min_t(size_t, usize, res_size); 64 + 65 + if (copy_struct_to_user(udata, usize, data, hdr.size, NULL)) 66 + return -EFAULT; 67 + if (copy_to_user(uhdr, &hdr, sizeof(hdr))) 68 + return -EFAULT; 69 + *next_entry = hdr.next_entry; 70 + return 0; 71 + } 72 + 73 + int io_query(struct io_ring_ctx *ctx, void 
__user *arg, unsigned nr_args) 74 + { 75 + char entry_buffer[IO_MAX_QUERY_SIZE]; 76 + void __user *uhdr = arg; 77 + int ret; 78 + 79 + memset(entry_buffer, 0, sizeof(entry_buffer)); 80 + 81 + if (nr_args) 82 + return -EINVAL; 83 + 84 + while (uhdr) { 85 + u64 next_hdr; 86 + 87 + ret = io_handle_query_entry(ctx, entry_buffer, uhdr, &next_hdr); 88 + if (ret) 89 + return ret; 90 + uhdr = u64_to_user_ptr(next_hdr); 91 + } 92 + return 0; 93 + }
+9
io_uring/query.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef IORING_QUERY_H 3 + #define IORING_QUERY_H 4 + 5 + #include <linux/io_uring_types.h> 6 + 7 + int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); 8 + 9 + #endif
+6
io_uring/register.c
··· 31 31 #include "msg_ring.h" 32 32 #include "memmap.h" 33 33 #include "zcrx.h" 34 + #include "query.h" 34 35 35 36 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 36 37 IORING_REGISTER_LAST + IORING_OP_LAST) ··· 833 832 break; 834 833 ret = io_register_mem_region(ctx, arg); 835 834 break; 835 + case IORING_REGISTER_QUERY: 836 + ret = io_query(ctx, arg, nr_args); 837 + break; 836 838 default: 837 839 ret = -EINVAL; 838 840 break; ··· 905 901 switch (opcode) { 906 902 case IORING_REGISTER_SEND_MSG_RING: 907 903 return io_uring_register_send_msg_ring(arg, nr_args); 904 + case IORING_REGISTER_QUERY: 905 + return io_query(NULL, arg, nr_args); 908 906 } 909 907 return -EINVAL; 910 908 }