Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring/kbuf: switch to storing struct io_buffer_list locally

Currently the buffer list is stored in struct io_kiocb. The buffer list
can be of two types:

1) Classic/legacy buffer list. These don't need to get referenced after
a buffer pick, and hence storing them in struct io_kiocb is perfectly
fine.

2) Ring provided buffer lists. These DO need to be referenced after the
initial buffer pick, as they need to get consumed later on. This can
be either just incrementing the head of the ring, or it can be
consuming parts of a buffer if incremental buffer consumptions has
been configured.

For case 2, io_uring needs to be careful not to access the buffer list
after the initial pick-and-execute context. The core does recycling of
these, but it's easy to make a mistake, because it's stored in the
io_kiocb which does persist across multiple execution contexts. Either
because it's a multishot request, or simply because it needed some kind
of async trigger (eg poll) for retry purposes.

Add a struct io_buffer_list to struct io_br_sel, which is always on
stack for the various users of it. This prevents the buffer list from
leaking outside of that execution context, and additionally it enables
kbuf to not even pass back the struct io_buffer_list if the given
context isn't appropriately locked already.

This doesn't fix any bugs, it's simply a defensive measure to prevent
any issues with reuse of a buffer list.

Link: https://lore.kernel.org/r/20250821020750.598432-12-axboe@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>

+55 -74
-6
include/linux/io_uring_types.h
··· 674 674 /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ 675 675 struct io_buffer *kbuf; 676 676 677 - /* 678 - * stores buffer ID for ring provided buffers, valid IFF 679 - * REQ_F_BUFFER_RING is set. 680 - */ 681 - struct io_buffer_list *buf_list; 682 - 683 677 struct io_rsrc_node *buf_node; 684 678 }; 685 679
+3 -3
io_uring/io_uring.c
··· 1007 1007 lockdep_assert_held(&req->ctx->uring_lock); 1008 1008 1009 1009 req_set_fail(req); 1010 - io_req_set_res(req, res, io_put_kbuf(req, res, req->buf_list)); 1010 + io_req_set_res(req, res, io_put_kbuf(req, res, NULL)); 1011 1011 if (def->fail) 1012 1012 def->fail(req); 1013 1013 io_req_complete_defer(req); ··· 2025 2025 2026 2026 switch (io_arm_poll_handler(req, 0)) { 2027 2027 case IO_APOLL_READY: 2028 - io_kbuf_recycle(req, req->buf_list, 0); 2028 + io_kbuf_recycle(req, NULL, 0); 2029 2029 io_req_task_queue(req); 2030 2030 break; 2031 2031 case IO_APOLL_ABORTED: 2032 - io_kbuf_recycle(req, req->buf_list, 0); 2032 + io_kbuf_recycle(req, NULL, 0); 2033 2033 io_queue_iowq(req); 2034 2034 break; 2035 2035 case IO_APOLL_OK:
+15 -12
io_uring/kbuf.c
··· 171 171 if (*len == 0 || *len > buf->len) 172 172 *len = buf->len; 173 173 req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; 174 - req->buf_list = bl; 175 174 req->buf_index = buf->bid; 175 + sel.buf_list = bl; 176 176 sel.addr = u64_to_user_ptr(buf->addr); 177 177 178 178 if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { ··· 186 186 * the transfer completes (or if we get -EAGAIN and must poll of 187 187 * retry). 188 188 */ 189 - io_kbuf_commit(req, bl, *len, 1); 190 - req->buf_list = NULL; 189 + io_kbuf_commit(req, sel.buf_list, *len, 1); 190 + sel.buf_list = NULL; 191 191 } 192 192 return sel; 193 193 } ··· 294 294 req->flags |= REQ_F_BL_EMPTY; 295 295 296 296 req->flags |= REQ_F_BUFFER_RING; 297 - req->buf_list = bl; 298 297 return iov - arg->iovs; 299 298 } 300 299 ··· 301 302 struct io_br_sel *sel, unsigned int issue_flags) 302 303 { 303 304 struct io_ring_ctx *ctx = req->ctx; 304 - struct io_buffer_list *bl; 305 305 int ret = -ENOENT; 306 306 307 307 io_ring_submit_lock(ctx, issue_flags); 308 - bl = io_buffer_get_list(ctx, arg->buf_group); 309 - if (unlikely(!bl)) 308 + sel->buf_list = io_buffer_get_list(ctx, arg->buf_group); 309 + if (unlikely(!sel->buf_list)) 310 310 goto out_unlock; 311 311 312 - if (bl->flags & IOBL_BUF_RING) { 313 - ret = io_ring_buffers_peek(req, arg, bl); 312 + if (sel->buf_list->flags & IOBL_BUF_RING) { 313 + ret = io_ring_buffers_peek(req, arg, sel->buf_list); 314 314 /* 315 315 * Don't recycle these buffers if we need to go through poll. 316 316 * Nobody else can use them anyway, and holding on to provided ··· 319 321 */ 320 322 if (ret > 0) { 321 323 req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; 322 - io_kbuf_commit(req, bl, arg->out_len, ret); 324 + io_kbuf_commit(req, sel->buf_list, arg->out_len, ret); 323 325 } 324 326 } else { 325 - ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); 327 + ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs); 326 328 } 327 329 out_unlock: 328 - io_ring_submit_unlock(ctx, issue_flags); 330 + if (issue_flags & IO_URING_F_UNLOCKED) { 331 + sel->buf_list = NULL; 332 + mutex_unlock(&ctx->uring_lock); 333 + } 329 334 return ret; 330 335 } 331 336 ··· 349 348 ret = io_ring_buffers_peek(req, arg, bl); 350 349 if (ret > 0) 351 350 req->flags |= REQ_F_BUFFERS_COMMIT; 351 + sel->buf_list = bl; 352 352 return ret; 353 353 } 354 354 355 355 /* don't support multiple buffer selections for legacy */ 356 + sel->buf_list = NULL; 356 357 return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); 357 358 } 358 359
+6 -10
io_uring/kbuf.h
··· 63 63 }; 64 64 65 65 /* 66 - * Return value from io_buffer_list selection. Just returns the error or 67 - * user address for now, will be extended to return the buffer list in the 68 - * future. 66 + * Return value from io_buffer_list selection, to avoid stashing it in 67 + * struct io_kiocb. For legacy/classic provided buffers, keeping a reference 68 + * across execution contexts are fine. But for ring provided buffers, the 69 + * list may go away as soon as ->uring_lock is dropped. As the io_kiocb 70 + * persists, it's better to just keep the buffer local for those cases. 69 71 */ 70 72 struct io_br_sel { 73 + struct io_buffer_list *buf_list; 71 74 /* 72 75 * Some selection parts return the user address, others return an error. 73 76 */ ··· 110 107 static inline bool io_kbuf_recycle_ring(struct io_kiocb *req, 111 108 struct io_buffer_list *bl) 112 109 { 113 - /* 114 - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear 115 - * the flag and hence ensure that bl->head doesn't get incremented. 116 - * If the tail has already been incremented, hang on to it. 117 - * The exception is partial io, that case we should increment bl->head 118 - * to monopolize the buffer. 119 - */ 120 110 if (bl) { 121 111 req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); 122 112 return true;
+17 -29
io_uring/net.c
··· 433 433 if (req->opcode == IORING_OP_SENDMSG) 434 434 return -EINVAL; 435 435 sr->msg_flags |= MSG_WAITALL; 436 - req->buf_list = NULL; 437 436 req->flags |= REQ_F_MULTISHOT; 438 437 } 439 438 ··· 511 512 unsigned int cflags; 512 513 513 514 if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { 514 - cflags = io_put_kbuf(req, sel->val, req->buf_list); 515 + cflags = io_put_kbuf(req, sel->val, sel->buf_list); 515 516 goto finish; 516 517 } 517 518 518 - cflags = io_put_kbufs(req, sel->val, req->buf_list, io_bundle_nbufs(kmsg, sel->val)); 519 + cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val)); 519 520 520 521 if (bundle_finished || req->flags & REQ_F_BL_EMPTY) 521 522 goto finish; ··· 656 657 flags |= MSG_DONTWAIT; 657 658 658 659 retry_bundle: 660 + sel.buf_list = NULL; 659 661 if (io_do_buffer_select(req)) { 660 662 ret = io_send_select_buffer(req, issue_flags, &sel, kmsg); 661 663 if (ret) ··· 682 682 sr->len -= ret; 683 683 sr->buf += ret; 684 684 sr->done_io += ret; 685 - return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret); 685 + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); 686 686 } 687 687 if (ret == -ERESTARTSYS) 688 688 ret = -EINTR; ··· 795 795 req->flags |= REQ_F_NOWAIT; 796 796 if (sr->msg_flags & MSG_ERRQUEUE) 797 797 req->flags |= REQ_F_CLEAR_POLLIN; 798 - if (req->flags & REQ_F_BUFFER_SELECT) { 799 - /* 800 - * Store the buffer group for this multishot receive separately, 801 - * as if we end up doing an io-wq based issue that selects a 802 - * buffer, it has to be committed immediately and that will 803 - * clear ->buf_list. This means we lose the link to the buffer 804 - * list, and the eventual buffer put on completion then cannot 805 - * restore it. 806 - */ 798 + if (req->flags & REQ_F_BUFFER_SELECT) 807 799 sr->buf_group = req->buf_index; 808 - req->buf_list = NULL; 809 - } 810 800 sr->mshot_total_len = sr->mshot_len = 0; 811 801 if (sr->flags & IORING_RECV_MULTISHOT) { 812 802 if (!(req->flags & REQ_F_BUFFER_SELECT)) ··· 864 874 if (sr->flags & IORING_RECVSEND_BUNDLE) { 865 875 size_t this_ret = sel->val - sr->done_io; 866 876 867 - cflags |= io_put_kbufs(req, this_ret, req->buf_list, io_bundle_nbufs(kmsg, this_ret)); 877 + cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret)); 868 878 if (sr->flags & IORING_RECV_RETRY) 869 879 cflags = req->cqe.flags | (cflags & CQE_F_MASK); 870 880 if (sr->mshot_len && sel->val >= sr->mshot_len) ··· 886 896 return false; 887 897 } 888 898 } else { 889 - cflags |= io_put_kbuf(req, sel->val, req->buf_list); 899 + cflags |= io_put_kbuf(req, sel->val, sel->buf_list); 890 900 } 891 901 892 902 /* ··· 1028 1038 flags |= MSG_DONTWAIT; 1029 1039 1030 1040 retry_multishot: 1041 + sel.buf_list = NULL; 1031 1042 if (io_do_buffer_select(req)) { 1032 1043 size_t len = sr->len; 1033 1044 ··· 1039 1048 if (req->flags & REQ_F_APOLL_MULTISHOT) { 1040 1049 ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len); 1041 1050 if (ret) { 1042 - io_kbuf_recycle(req, req->buf_list, issue_flags); 1051 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1043 1052 return ret; 1044 1053 } 1045 1054 } ··· 1063 1072 1064 1073 if (ret < min_ret) { 1065 1074 if (ret == -EAGAIN && force_nonblock) { 1066 - if (issue_flags & IO_URING_F_MULTISHOT) 1067 - io_kbuf_recycle(req, req->buf_list, issue_flags); 1068 - 1075 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1069 1076 return IOU_RETRY; 1070 1077 } 1071 1078 if (ret > 0 && io_net_retry(sock, flags)) { 1072 1079 sr->done_io += ret; 1073 - return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret); 1080 + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); 1074 1081 } 1075 1082 if (ret == -ERESTARTSYS) 1076 1083 ret = -EINTR; ··· 1082 1093 else if (sr->done_io) 1083 1094 ret = sr->done_io; 1084 1095 else 1085 - io_kbuf_recycle(req, req->buf_list, issue_flags); 1096 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1086 1097 1087 1098 sel.val = ret; 1088 1099 if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) ··· 1167 1178 { 1168 1179 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1169 1180 struct io_async_msghdr *kmsg = req->async_data; 1170 - struct io_br_sel sel = { }; 1181 + struct io_br_sel sel; 1171 1182 struct socket *sock; 1172 1183 unsigned flags; 1173 1184 int ret, min_ret = 0; ··· 1187 1198 flags |= MSG_DONTWAIT; 1188 1199 1189 1200 retry_multishot: 1201 + sel.buf_list = NULL; 1190 1202 if (io_do_buffer_select(req)) { 1191 1203 sel.val = sr->len; 1192 1204 ret = io_recv_buf_select(req, kmsg, &sel, issue_flags); ··· 1207 1217 ret = sock_recvmsg(sock, &kmsg->msg, flags); 1208 1218 if (ret < min_ret) { 1209 1219 if (ret == -EAGAIN && force_nonblock) { 1210 - if (issue_flags & IO_URING_F_MULTISHOT) 1211 - io_kbuf_recycle(req, req->buf_list, issue_flags); 1212 - 1220 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1213 1221 return IOU_RETRY; 1214 1222 } 1215 1223 if (ret > 0 && io_net_retry(sock, flags)) { 1216 1224 sr->len -= ret; 1217 1225 sr->buf += ret; 1218 1226 sr->done_io += ret; 1219 - return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret); 1227 + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); 1220 1228 } 1221 1229 if (ret == -ERESTARTSYS) 1222 1230 ret = -EINTR; ··· 1230 1242 else if (sr->done_io) 1231 1243 ret = sr->done_io; 1232 1244 else 1233 - io_kbuf_recycle(req, req->buf_list, issue_flags); 1245 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1234 1246 1235 1247 sel.val = ret; 1236 1248 if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
+3 -3
io_uring/poll.c
··· 316 316 317 317 ret = io_poll_check_events(req, tw); 318 318 if (ret == IOU_POLL_NO_ACTION) { 319 - io_kbuf_recycle(req, req->buf_list, 0); 319 + io_kbuf_recycle(req, NULL, 0); 320 320 return; 321 321 } else if (ret == IOU_POLL_REQUEUE) { 322 - io_kbuf_recycle(req, req->buf_list, 0); 322 + io_kbuf_recycle(req, NULL, 0); 323 323 __io_poll_execute(req, 0); 324 324 return; 325 325 } ··· 686 686 req->flags |= REQ_F_POLLED; 687 687 ipt.pt._qproc = io_async_queue_proc; 688 688 689 - io_kbuf_recycle(req, req->buf_list, issue_flags); 689 + io_kbuf_recycle(req, NULL, issue_flags); 690 690 691 691 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); 692 692 if (ret)
+11 -11
io_uring/rw.c
··· 579 579 io_req_io_end(req); 580 580 581 581 if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) 582 - req->cqe.flags |= io_put_kbuf(req, req->cqe.res, req->buf_list); 582 + req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); 583 583 584 584 io_req_rw_cleanup(req, 0); 585 585 io_req_task_complete(req, tw); ··· 648 648 } 649 649 650 650 static int kiocb_done(struct io_kiocb *req, ssize_t ret, 651 - unsigned int issue_flags) 651 + struct io_br_sel *sel, unsigned int issue_flags) 652 652 { 653 653 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 654 654 unsigned final_ret = io_fixup_rw_res(req, ret); ··· 662 662 * from the submission path. 663 663 */ 664 664 io_req_io_end(req); 665 - io_req_set_res(req, final_ret, io_put_kbuf(req, ret, req->buf_list)); 665 + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, sel->buf_list)); 666 666 io_req_rw_cleanup(req, issue_flags); 667 667 return IOU_COMPLETE; 668 668 } else { ··· 1024 1024 1025 1025 ret = __io_read(req, &sel, issue_flags); 1026 1026 if (ret >= 0) 1027 - return kiocb_done(req, ret, issue_flags); 1027 + return kiocb_done(req, ret, &sel, issue_flags); 1028 1028 1029 1029 if (req->flags & REQ_F_BUFFERS_COMMIT) 1030 - io_kbuf_recycle(req, req->buf_list, issue_flags); 1030 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1031 1031 return ret; 1032 1032 } 1033 1033 ··· 1057 1057 * Reset rw->len to 0 again to avoid clamping future mshot 1058 1058 * reads, in case the buffer size varies. 1059 1059 */ 1060 - if (io_kbuf_recycle(req, req->buf_list, issue_flags)) 1060 + if (io_kbuf_recycle(req, sel.buf_list, issue_flags)) 1061 1061 rw->len = 0; 1062 1062 return IOU_RETRY; 1063 1063 } else if (ret <= 0) { 1064 - io_kbuf_recycle(req, req->buf_list, issue_flags); 1064 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1065 1065 if (ret < 0) 1066 1066 req_set_fail(req); 1067 1067 } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { 1068 - cflags = io_put_kbuf(req, ret, req->buf_list); 1068 + cflags = io_put_kbuf(req, ret, sel.buf_list); 1069 1069 } else { 1070 1070 /* 1071 1071 * Any successful return value will keep the multishot read ··· 1073 1073 * we fail to post a CQE, or multishot is no longer set, then 1074 1074 * jump to the termination path. This request is then done. 1075 1075 */ 1076 - cflags = io_put_kbuf(req, ret, req->buf_list); 1076 + cflags = io_put_kbuf(req, ret, sel.buf_list); 1077 1077 rw->len = 0; /* similarly to above, reset len to 0 */ 1078 1078 1079 1079 if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { ··· 1202 1202 return -EAGAIN; 1203 1203 } 1204 1204 done: 1205 - return kiocb_done(req, ret2, issue_flags); 1205 + return kiocb_done(req, ret2, NULL, issue_flags); 1206 1206 } else { 1207 1207 ret_eagain: 1208 1208 iov_iter_restore(&io->iter, &io->iter_state); ··· 1370 1370 if (!smp_load_acquire(&req->iopoll_completed)) 1371 1371 break; 1372 1372 nr_events++; 1373 - req->cqe.flags = io_put_kbuf(req, req->cqe.res, req->buf_list); 1373 + req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL); 1374 1374 if (req->opcode != IORING_OP_URING_CMD) 1375 1375 io_req_rw_cleanup(req, 0); 1376 1376 }