Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'io_uring-6.18' into for-6.19/io_uring

Merge 6.18-rc io_uring fixes, as certain coming changes depend on some
of these.

* io_uring-6.18:
io_uring/rsrc: don't use blk_rq_nr_phys_segments() as number of bvecs
io_uring/query: return number of available queries
io_uring/rw: ensure allocated iovec gets cleared for early failure
io_uring: fix regbuf vector size truncation
io_uring: fix types for region size calculation
io_uring/zcrx: remove sync refill uapi
io_uring: fix buffer auto-commit for multishot uring_cmd
io_uring: correct __must_hold annotation in io_install_fixed_file
io_uring zcrx: add MAINTAINERS entry
io_uring: Fix code indentation error
io_uring/sqpoll: be smarter on when to update the stime usage
io_uring/sqpoll: switch away from getrusage() for CPU accounting
io_uring: fix incorrect unlikely() usage in io_waitid_prep()

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+111 -138
+9
MAINTAINERS
···
      F:      include/uapi/linux/io_uring/
      F:      io_uring/

+     IO_URING ZCRX
+     M:      Pavel Begunkov <asml.silence@gmail.com>
+     L:      io-uring@vger.kernel.org
+     L:      netdev@vger.kernel.org
+     T:      git https://github.com/isilence/linux.git zcrx/for-next
+     T:      git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git
+     S:      Maintained
+     F:      io_uring/zcrx.*
+
      IPMI SUBSYSTEM
      M:      Corey Minyard <corey@minyard.net>
      L:      openipmi-developer@lists.sourceforge.net (moderated for non-subscribers)
-12
include/uapi/linux/io_uring.h
···
        /* query various aspects of io_uring, see linux/io_uring/query.h */
        IORING_REGISTER_QUERY                   = 35,

-       /* return zcrx buffers back into circulation */
-       IORING_REGISTER_ZCRX_REFILL             = 36,
-
        /* this goes last */
        IORING_REGISTER_LAST,

···
        __u32   zcrx_id;
        __u32   __resv2;
        __u64   __resv[3];
-};
-
-struct io_uring_zcrx_sync_refill {
-       __u32   zcrx_id;
-       /* the number of entries to return */
-       __u32   nr_entries;
-       /* pointer to an array of struct io_uring_zcrx_rqe */
-       __u64   rqes;
-       __u64   __resv[2];
};

#ifdef __cplusplus
+3
include/uapi/linux/io_uring/query.h
···
        __u64   enter_flags;
        /* Bitmask of all supported IOSQE_* flags */
        __u64   sqe_flags;
+       /* The number of available query opcodes */
+       __u32   nr_query_opcodes;
+       __u32   __pad;
};

#endif
+4 -4
io_uring/fdinfo.c
···
{
        struct io_overflow_cqe *ocqe;
        struct io_rings *r = ctx->rings;
-       struct rusage sq_usage;
        unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
        unsigned int sq_head = READ_ONCE(r->sq.head);
        unsigned int sq_tail = READ_ONCE(r->sq.tail);
···
         * thread termination.
         */
        if (tsk) {
+               u64 usec;
+
                get_task_struct(tsk);
                rcu_read_unlock();
-               getrusage(tsk, RUSAGE_SELF, &sq_usage);
+               usec = io_sq_cpu_usec(tsk);
                put_task_struct(tsk);
                sq_pid = sq->task_pid;
                sq_cpu = sq->sq_cpu;
-               sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
-                                + sq_usage.ru_stime.tv_usec);
+               sq_total_time = usec;
                sq_work_time = sq->work_time;
        } else {
                rcu_read_unlock();
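The fdinfo change above keeps reporting SQPOLL CPU time in microseconds; only the source of the value moves from getrusage() to io_sq_cpu_usec(). As a rough, hedged illustration of where these numbers surface, the sketch below scans a ring file descriptor's fdinfo in procfs. The SqTotalTime/SqWorkTime field names are an assumption based on io_uring's fdinfo output and do not appear in the hunk above.

/*
 * Hedged sketch: print the SQPOLL accounting lines from a ring fd's fdinfo.
 * Field names (SqTotalTime/SqWorkTime, microsecond values) are assumed and
 * are not part of the diff shown here.
 */
#include <stdio.h>
#include <string.h>

static void dump_sqpoll_times(int ring_fd)
{
        char path[64], line[256];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", ring_fd);
        f = fopen(path, "r");
        if (!f)
                return;
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "SqTotalTime", 11) ||
                    !strncmp(line, "SqWorkTime", 10))
                        fputs(line, stdout);
        }
        fclose(f);
}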
+1 -1
io_uring/filetable.c
···

static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
                                 u32 slot_index)
-       __must_hold(&req->ctx->uring_lock)
+       __must_hold(&ctx->uring_lock)
{
        struct io_rsrc_node *node;

+1 -1
io_uring/io_uring.c
···
}

static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe,
-                                 struct io_big_cqe *big_cqe)
+                                  struct io_big_cqe *big_cqe)
{
        struct io_overflow_cqe *ocqe;

+22 -11
io_uring/kbuf.c
···
        return 1;
}

+static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags)
+{
+       /*
+        * If we came in unlocked, we have no choice but to consume the
+        * buffer here, otherwise nothing ensures that the buffer won't
+        * get used by others. This does mean it'll be pinned until the
+        * IO completes, coming in unlocked means we're being called from
+        * io-wq context and there may be further retries in async hybrid
+        * mode. For the locked case, the caller must call commit when
+        * the transfer completes (or if we get -EAGAIN and must poll of
+        * retry).
+        */
+       if (issue_flags & IO_URING_F_UNLOCKED)
+               return true;
+
+       /* uring_cmd commits kbuf upfront, no need to auto-commit */
+       if (!io_file_can_poll(req) && req->opcode != IORING_OP_URING_CMD)
+               return true;
+       return false;
+}
+
static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
                                              struct io_buffer_list *bl,
                                              unsigned int issue_flags)
···
        sel.buf_list = bl;
        sel.addr = u64_to_user_ptr(buf->addr);

-       if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
-               /*
-                * If we came in unlocked, we have no choice but to consume the
-                * buffer here, otherwise nothing ensures that the buffer won't
-                * get used by others. This does mean it'll be pinned until the
-                * IO completes, coming in unlocked means we're being called from
-                * io-wq context and there may be further retries in async hybrid
-                * mode. For the locked case, the caller must call commit when
-                * the transfer completes (or if we get -EAGAIN and must poll of
-                * retry).
-                */
+       if (io_should_commit(req, issue_flags)) {
                io_kbuf_commit(req, sel.buf_list, *len, 1);
                sel.buf_list = NULL;
        }
+1 -1
io_uring/net.c
···
                return 0;

        if (sr->flags & IORING_SEND_VECTORIZED)
-          return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);
+               return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);

        return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
}
+2
io_uring/query.c
···
        e->ring_setup_flags = IORING_SETUP_FLAGS;
        e->enter_flags = IORING_ENTER_FLAGS;
        e->sqe_flags = SQE_VALID_FLAGS;
+       e->nr_query_opcodes = __IO_URING_QUERY_MAX;
+       e->__pad = 0;
        return sizeof(*e);
}

-3
io_uring/register.c
···
        case IORING_REGISTER_QUERY:
                ret = io_query(ctx, arg, nr_args);
                break;
-       case IORING_REGISTER_ZCRX_REFILL:
-               ret = io_zcrx_return_bufs(ctx, arg, nr_args);
-               break;
        default:
                ret = -EINVAL;
                break;
+18 -9
io_uring/rsrc.c
···
        struct req_iterator rq_iter;
        struct io_mapped_ubuf *imu;
        struct io_rsrc_node *node;
-       struct bio_vec bv, *bvec;
-       u16 nr_bvecs;
+       struct bio_vec bv;
+       unsigned int nr_bvecs = 0;
        int ret = 0;

        io_ring_submit_lock(ctx, issue_flags);
···
                goto unlock;
        }

-       nr_bvecs = blk_rq_nr_phys_segments(rq);
-       imu = io_alloc_imu(ctx, nr_bvecs);
+       /*
+        * blk_rq_nr_phys_segments() may overestimate the number of bvecs
+        * but avoids needing to iterate over the bvecs
+        */
+       imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
        if (!imu) {
                kfree(node);
                ret = -ENOMEM;
···
        imu->len = blk_rq_bytes(rq);
        imu->acct_pages = 0;
        imu->folio_shift = PAGE_SHIFT;
-       imu->nr_bvecs = nr_bvecs;
        refcount_set(&imu->refs, 1);
        imu->release = release;
        imu->priv = rq;
        imu->is_kbuf = true;
        imu->dir = 1 << rq_data_dir(rq);

-       bvec = imu->bvec;
        rq_for_each_bvec(bv, rq, rq_iter)
-               *bvec++ = bv;
+               imu->bvec[nr_bvecs++] = bv;
+       imu->nr_bvecs = nr_bvecs;

        node->buf = imu;
        data->nodes[index] = node;
···
        size_t max_segs = 0;
        unsigned i;

-       for (i = 0; i < nr_iovs; i++)
+       for (i = 0; i < nr_iovs; i++) {
                max_segs += (iov[i].iov_len >> shift) + 2;
+               if (max_segs > INT_MAX)
+                       return -EOVERFLOW;
+       }
        return max_segs;
}
···
                if (unlikely(ret))
                        return ret;
        } else {
-               nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
+               int ret = io_estimate_bvec_size(iov, nr_iovs, imu);
+
+               if (ret < 0)
+                       return ret;
+               nr_segs = ret;
        }

        if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
+3
io_uring/rw.c
···

void io_readv_writev_cleanup(struct io_kiocb *req)
{
+       struct io_async_rw *rw = req->async_data;
+
        lockdep_assert_held(&req->ctx->uring_lock);
+       io_vec_free(&rw->vec);
        io_rw_recycle(req, 0);
}

+45 -20
io_uring/sqpoll.c
···
#include <linux/audit.h>
#include <linux/security.h>
#include <linux/cpuset.h>
+#include <linux/sched/cputime.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>
···
        return READ_ONCE(sqd->state);
}

-static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
+struct io_sq_time {
+       bool started;
+       u64 usec;
+};
+
+u64 io_sq_cpu_usec(struct task_struct *tsk)
+{
+       u64 utime, stime;
+
+       task_cputime_adjusted(tsk, &utime, &stime);
+       do_div(stime, 1000);
+       return stime;
+}
+
+static void io_sq_update_worktime(struct io_sq_data *sqd, struct io_sq_time *ist)
+{
+       if (!ist->started)
+               return;
+       ist->started = false;
+       sqd->work_time += io_sq_cpu_usec(current) - ist->usec;
+}
+
+static void io_sq_start_worktime(struct io_sq_time *ist)
+{
+       if (ist->started)
+               return;
+       ist->started = true;
+       ist->usec = io_sq_cpu_usec(current);
+}
+
+static int __io_sq_thread(struct io_ring_ctx *ctx, struct io_sq_data *sqd,
+                         bool cap_entries, struct io_sq_time *ist)
{
        unsigned int to_submit;
        int ret = 0;
···

        if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
                const struct cred *creds = NULL;
+
+               io_sq_start_worktime(ist);

                if (ctx->sq_creds != current_cred())
                        creds = override_creds(ctx->sq_creds);
···
        return retry_list || !llist_empty(&tctx->task_list);
}

-static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start)
-{
-       struct rusage end;
-
-       getrusage(current, RUSAGE_SELF, &end);
-       end.ru_stime.tv_sec -= start->ru_stime.tv_sec;
-       end.ru_stime.tv_usec -= start->ru_stime.tv_usec;
-
-       sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000;
-}
-
static int io_sq_thread(void *data)
{
        struct llist_node *retry_list = NULL;
        struct io_sq_data *sqd = data;
        struct io_ring_ctx *ctx;
-       struct rusage start;
        unsigned long timeout = 0;
        char buf[TASK_COMM_LEN] = {};
        DEFINE_WAIT(wait);
···
        mutex_lock(&sqd->lock);
        while (1) {
                bool cap_entries, sqt_spin = false;
+               struct io_sq_time ist = { };

                if (io_sqd_events_pending(sqd) || signal_pending(current)) {
                        if (io_sqd_handle_event(sqd))
···
                }

                cap_entries = !list_is_singular(&sqd->ctx_list);
-               getrusage(current, RUSAGE_SELF, &start);
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-                       int ret = __io_sq_thread(ctx, cap_entries);
+                       int ret = __io_sq_thread(ctx, sqd, cap_entries, &ist);

                        if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
                                sqt_spin = true;
···
                if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
                        sqt_spin = true;

-               list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
-                       if (io_napi(ctx))
+               list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+                       if (io_napi(ctx)) {
+                               io_sq_start_worktime(&ist);
                                io_napi_sqpoll_busy_poll(ctx);
+                       }
+               }
+
+               io_sq_update_worktime(sqd, &ist);

                if (sqt_spin || !time_after(jiffies, timeout)) {
-                       if (sqt_spin) {
-                               io_sq_update_worktime(sqd, &start);
+                       if (sqt_spin)
                                timeout = jiffies + sqd->sq_thread_idle;
-                       }
                        if (unlikely(need_resched())) {
                                mutex_unlock(&sqd->lock);
                                cond_resched();
+1
io_uring/sqpoll.h
···
void io_put_sq_data(struct io_sq_data *sqd);
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
+u64 io_sq_cpu_usec(struct task_struct *tsk);

static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd)
{
+1 -1
io_uring/waitid.c
···
                return -EINVAL;

        iwa = io_uring_alloc_async_data(NULL, req);
-       if (!unlikely(iwa))
+       if (unlikely(!iwa))
                return -ENOMEM;
        iwa->req = req;

-68
io_uring/zcrx.c
···
        .uninstall              = io_pp_uninstall,
};

-#define IO_ZCRX_MAX_SYS_REFILL_BUFS            (1 << 16)
-#define IO_ZCRX_SYS_REFILL_BATCH               32
-
-static void io_return_buffers(struct io_zcrx_ifq *ifq,
-                              struct io_uring_zcrx_rqe *rqes, unsigned nr)
-{
-       int i;
-
-       for (i = 0; i < nr; i++) {
-               struct net_iov *niov;
-               netmem_ref netmem;
-
-               if (!io_parse_rqe(&rqes[i], ifq, &niov))
-                       continue;
-
-               scoped_guard(spinlock_bh, &ifq->rq_lock) {
-                       if (!io_zcrx_put_niov_uref(niov))
-                               continue;
-               }
-
-               netmem = net_iov_to_netmem(niov);
-               if (!page_pool_unref_and_test(netmem))
-                       continue;
-               io_zcrx_return_niov(niov);
-       }
-}
-
-int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
-                       void __user *arg, unsigned nr_arg)
-{
-       struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH];
-       struct io_uring_zcrx_rqe __user *user_rqes;
-       struct io_uring_zcrx_sync_refill zr;
-       struct io_zcrx_ifq *ifq;
-       unsigned nr, i;
-
-       if (nr_arg)
-               return -EINVAL;
-       if (copy_from_user(&zr, arg, sizeof(zr)))
-               return -EFAULT;
-       if (!zr.nr_entries || zr.nr_entries > IO_ZCRX_MAX_SYS_REFILL_BUFS)
-               return -EINVAL;
-       if (!mem_is_zero(&zr.__resv, sizeof(zr.__resv)))
-               return -EINVAL;
-
-       ifq = xa_load(&ctx->zcrx_ctxs, zr.zcrx_id);
-       if (!ifq)
-               return -EINVAL;
-       nr = zr.nr_entries;
-       user_rqes = u64_to_user_ptr(zr.rqes);
-
-       for (i = 0; i < nr;) {
-               unsigned batch = min(nr - i, IO_ZCRX_SYS_REFILL_BATCH);
-               size_t size = batch * sizeof(rqes[0]);
-
-               if (copy_from_user(rqes, user_rqes + i, size))
-                       return i ? i : -EFAULT;
-               io_return_buffers(ifq, rqes, batch);
-
-               i += batch;
-
-               if (fatal_signal_pending(current))
-                       return i;
-               cond_resched();
-       }
-       return nr;
-}
-
static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
                              struct io_zcrx_ifq *ifq, int off, int len)
{
-7
io_uring/zcrx.h
···
};

#if defined(CONFIG_IO_URING_ZCRX)
-int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
-                       void __user *arg, unsigned nr_arg);
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
                         struct io_uring_zcrx_ifq_reg __user *arg);
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
···
                                             unsigned int id)
{
        return NULL;
-}
-static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx,
-                                     void __user *arg, unsigned nr_arg)
-{
-       return -EOPNOTSUPP;
}
#endif
