Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git


Merge tag 'io_uring-6.19-20251208' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring updates from Jens Axboe:
"Followup set of fixes for io_uring for this merge window. These are
either later fixes, or cleanups that don't make sense to defer. This
pull request contains:

- Fix for a recent regression in io-wq worker creation

- Tracing cleanup

- Use READ_ONCE/WRITE_ONCE consistently for ring mapped kbufs. Mostly
for documentation purposes, indicating that they are shared with
userspace

- Fix for POLL_ADD losing a completion if the request is updated and is
now triggerable - e.g. if POLLIN is set with the update and the polled
file is readable

- In conjunction with the above fix, also unify how poll wait queue
entries are deleted with the head update. We had 3 different spots
doing both the list deletion and head write, with one of them
nicely documented. Abstract that into a helper and use it
consistently

- Small series from Joanne fixing an issue with buffer cloning, and
cleaning up the arg validation"

* tag 'io_uring-6.19-20251208' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
io_uring/poll: unify poll waitqueue entry and list removal
io_uring/kbuf: use WRITE_ONCE() for userspace-shared buffer ring fields
io_uring/kbuf: use READ_ONCE() for userspace-mapped memory
io_uring/rsrc: fix lost entries after cloned range
io_uring/rsrc: rename misleading src_node variable in io_clone_buffers()
io_uring/rsrc: clean up buffer cloning arg validation
io_uring/trace: rename io_uring_queue_async_work event "rw" field
io_uring/io-wq: always retry worker create on ERESTART*
io_uring/poll: correctly handle io_poll_add() return value on update

5 files changed: +67 -65

include/trace/events/io_uring.h (+6 -6)
···
  * io_uring_queue_async_work - called before submitting a new async work
  *
  * @req: pointer to a submitted request
- * @rw: type of workqueue, hashed or normal
+ * @hashed: whether async work is hashed
  *
  * Allows to trace asynchronous work submission.
  */
 TRACE_EVENT(io_uring_queue_async_work,

-	TP_PROTO(struct io_kiocb *req, int rw),
+	TP_PROTO(struct io_kiocb *req, bool hashed),

-	TP_ARGS(req, rw),
+	TP_ARGS(req, hashed),

 	TP_STRUCT__entry (
 		__field( void *,			ctx	)
···
 		__field( u8,				opcode	)
 		__field( unsigned long long,		flags	)
 		__field( struct io_wq_work *,		work	)
-		__field( int,				rw	)
+		__field( bool,				hashed	)

 		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
···
 		__entry->flags		= (__force unsigned long long) req->flags;
 		__entry->opcode		= req->opcode;
 		__entry->work		= &req->work;
-		__entry->rw		= rw;
+		__entry->hashed		= hashed;

 		__assign_str(op_str);
 	),
···
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
 		__entry->ctx, __entry->req, __entry->user_data,
 		__get_str(op_str), __entry->flags,
-		__entry->rw ? "hashed" : "normal", __entry->work)
+		__entry->hashed ? "hashed" : "normal", __entry->work)
 );

 /**

io_uring/io-wq.c (+3 -2)
···
 	 */
 	if (fatal_signal_pending(current))
 		return false;
-	if (worker->init_retries++ >= WORKER_INIT_LIMIT)
-		return false;

+	worker->init_retries++;
 	switch (err) {
 	case -EAGAIN:
+		return worker->init_retries <= WORKER_INIT_LIMIT;
+	/* Analogous to a fork() syscall, always retry on a restartable error */
 	case -ERESTARTSYS:
 	case -ERESTARTNOINTR:
 	case -ERESTARTNOHAND:
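
The policy the hunk above encodes: transient -EAGAIN failures are bounded by WORKER_INIT_LIMIT, while the -ERESTART* family is always retried, just as a fork() caller would restart after a signal. A self-contained sketch of that decision as a standalone function, with the kernel-internal error values and the retry limit stubbed for illustration (the real definitions live in the kernel headers and io-wq.c):

#include <errno.h>
#include <stdbool.h>

/* Kernel-internal values, stubbed so this sketch compiles in userspace. */
#define ERESTARTSYS		512
#define ERESTARTNOINTR		513
#define ERESTARTNOHAND		514
#define WORKER_INIT_LIMIT	3	/* assumed limit, for illustration only */

/*
 * Decide whether a failed worker-create attempt should be retried:
 * -EAGAIN is a resource problem and is bounded, while restartable errors
 * mean a signal interrupted the attempt and are always retried (a fatal
 * signal is what eventually breaks the loop).
 */
static bool should_retry_worker_create(int err, int *init_retries)
{
	(*init_retries)++;

	switch (err) {
	case -EAGAIN:
		return *init_retries <= WORKER_INIT_LIMIT;
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
		return true;
	default:
		return false;
	}
}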

io_uring/kbuf.c (+8 -8)
···
 		buf_len -= this_len;
 		/* Stop looping for invalid buffer length of 0 */
 		if (buf_len || !this_len) {
-			buf->addr += this_len;
-			buf->len = buf_len;
+			WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
+			WRITE_ONCE(buf->len, buf_len);
 			return false;
 		}
-		buf->len = 0;
+		WRITE_ONCE(buf->len, 0);
 		bl->head++;
 		len -= this_len;
 	}
···
 	if (*len == 0 || *len > buf_len)
 		*len = buf_len;
 	req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
-	req->buf_index = buf->bid;
+	req->buf_index = READ_ONCE(buf->bid);
 	sel.buf_list = bl;
-	sel.addr = u64_to_user_ptr(buf->addr);
+	sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));

 	if (io_should_commit(req, issue_flags)) {
 		io_kbuf_commit(req, sel.buf_list, *len, 1);
···
 	if (!arg->max_len)
 		arg->max_len = INT_MAX;

-	req->buf_index = buf->bid;
+	req->buf_index = READ_ONCE(buf->bid);
 	do {
 		u32 len = READ_ONCE(buf->len);

···
 				arg->partial_map = 1;
 				if (iov != arg->iovs)
 					break;
-				buf->len = len;
+				WRITE_ONCE(buf->len, len);
 			}
 		}

-		iov->iov_base = u64_to_user_ptr(buf->addr);
+		iov->iov_base = u64_to_user_ptr(READ_ONCE(buf->addr));
 		iov->iov_len = len;
 		iov++;
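
The buffer ring these hunks touch is mapped into the application's address space, so the kernel's loads and stores of addr, len and bid race with userspace by design. READ_ONCE()/WRITE_ONCE() force single, untorn accesses and flag the sharing for readers of the code. A minimal userspace sketch of the same annotation, using simplified stand-ins for the kernel macros (the real ones handle more cases) applied to the UAPI struct io_uring_buf, assuming the Linux UAPI header is available:

#include <linux/io_uring.h>	/* struct io_uring_buf (UAPI) */
#include <stdint.h>

/*
 * Simplified stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(): the
 * volatile cast forces exactly one load or store of the full field, so the
 * compiler cannot tear, refetch or elide accesses to memory that another
 * party (here, the application owning the mapped ring) may touch concurrently.
 */
#define READ_ONCE(x)		(*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

/* Consume 'used' bytes from the buffer at the ring head, as in the first hunk. */
static void buf_consume(struct io_uring_buf *buf, uint32_t used)
{
	uint64_t addr = READ_ONCE(buf->addr);	/* one load of the shared field */
	uint32_t len = READ_ONCE(buf->len);

	WRITE_ONCE(buf->addr, addr + used);	/* one store, no tearing */
	WRITE_ONCE(buf->len, len - used);
}

On most architectures this compiles to the same instructions as plain accesses; as the commit message says, the point is mostly to pin down what the compiler may do and to document that the fields are shared with userspace.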

io_uring/poll.c (+29 -23)
···
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 }

+static void io_poll_remove_waitq(struct io_poll *poll)
+{
+	/*
+	 * If the waitqueue is being freed early but someone is already holds
+	 * ownership over it, we have to tear down the request as best we can.
+	 * That means immediately removing the request from its waitqueue and
+	 * preventing all further accesses to the waitqueue via the request.
+	 */
+	list_del_init(&poll->wait.entry);
+
+	/*
+	 * Careful: this *must* be the last step, since as soon as req->head is
+	 * NULL'ed out, the request can be completed and freed, since
+	 * io_poll_remove_entry() will no longer need to take the waitqueue
+	 * lock.
+	 */
+	smp_store_release(&poll->head, NULL);
+}
+
 static inline void io_poll_remove_entry(struct io_poll *poll)
 {
 	struct wait_queue_head *head = smp_load_acquire(&poll->head);

 	if (head) {
 		spin_lock_irq(&head->lock);
-		list_del_init(&poll->wait.entry);
-		poll->head = NULL;
+		io_poll_remove_waitq(poll);
 		spin_unlock_irq(&head->lock);
 	}
 }
···
 		io_poll_mark_cancelled(req);
 		/* we have to kick tw in case it's not already */
 		io_poll_execute(req, 0);
-
-		/*
-		 * If the waitqueue is being freed early but someone is already
-		 * holds ownership over it, we have to tear down the request as
-		 * best we can. That means immediately removing the request from
-		 * its waitqueue and preventing all further accesses to the
-		 * waitqueue via the request.
-		 */
-		list_del_init(&poll->wait.entry);
-
-		/*
-		 * Careful: this *must* be the last step, since as soon
-		 * as req->head is NULL'ed out, the request can be
-		 * completed and freed, since aio_poll_complete_work()
-		 * will no longer need to take the waitqueue lock.
-		 */
-		smp_store_release(&poll->head, NULL);
+		io_poll_remove_waitq(poll);
 		return 1;
 	}
···

 	/* optional, saves extra locking for removal in tw handler */
 	if (mask && poll->events & EPOLLONESHOT) {
-		list_del_init(&poll->wait.entry);
-		poll->head = NULL;
+		io_poll_remove_waitq(poll);
 		if (wqe_is_double(wait))
 			req->flags &= ~REQ_F_DOUBLE_POLL;
 		else
···

 	ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED);
 	/* successfully updated, don't complete poll request */
-	if (!ret2 || ret2 == -EIOCBQUEUED)
+	if (ret2 == IOU_ISSUE_SKIP_COMPLETE)
 		goto out;
+	/* request completed as part of the update, complete it */
+	else if (ret2 == IOU_COMPLETE)
+		goto complete;
 	}

-	req_set_fail(preq);
 	io_req_set_res(preq, -ECANCELED, 0);
+complete:
+	if (preq->cqe.res < 0)
+		req_set_fail(preq);
 	preq->io_task_work.func = io_req_task_complete;
 	io_req_task_work_add(preq);
 out:
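
Beyond deduplicating the three call sites, the helper pins down an ordering rule: the wait entry is unlinked first, and only then is poll->head published as NULL with a release store, which pairs with the smp_load_acquire() in io_poll_remove_entry(), so a reader that observes NULL also observes the unlink. A minimal userspace sketch of that publish/unpublish pairing with C11 atomics (hypothetical types, not the io_uring structures):

#include <stdatomic.h>
#include <stddef.h>

/* Hypothetical stand-ins; the real code uses io_poll and wait_queue_head. */
struct waitq_head { int dummy_lock; };

struct poll_entry {
	_Atomic(struct waitq_head *) head;	/* published/unpublished pointer */
	int linked;				/* stands in for wait.entry linkage */
};

/*
 * Tear-down side (io_poll_remove_waitq in the patch): unlink first, then
 * clear head with release ordering. Once head reads as NULL the request may
 * be completed and freed, so the store must come last.
 */
static void poll_remove_waitq(struct poll_entry *p)
{
	p->linked = 0;					/* list_del_init() stand-in */
	atomic_store_explicit(&p->head, NULL, memory_order_release);
}

/*
 * Removal side (io_poll_remove_entry in the patch): the acquire load pairs
 * with the release store above, so a non-NULL result guarantees the entry is
 * still linked and it is safe to go take the waitqueue lock.
 */
static struct waitq_head *poll_get_head(struct poll_entry *p)
{
	return atomic_load_explicit(&p->head, memory_order_acquire);
}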

io_uring/rsrc.c (+21 -26)
···
 		return -EBUSY;

 	nbufs = src_ctx->buf_table.nr;
+	if (!nbufs)
+		return -ENXIO;
 	if (!arg->nr)
 		arg->nr = nbufs;
 	else if (arg->nr > nbufs)
 		return -EINVAL;
 	else if (arg->nr > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
+	if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
+		return -EOVERFLOW;
 	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
 		return -EOVERFLOW;
 	if (nbufs > IORING_MAX_REG_BUFFERS)
···
 	if (ret)
 		return ret;

-	/* Fill entries in data from dst that won't overlap with src */
+	/* Copy original dst nodes from before the cloned range */
 	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
-		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
+		struct io_rsrc_node *node = ctx->buf_table.nodes[i];

-		if (src_node) {
-			data.nodes[i] = src_node;
-			src_node->refs++;
+		if (node) {
+			data.nodes[i] = node;
+			node->refs++;
 		}
 	}
-
-	ret = -ENXIO;
-	nbufs = src_ctx->buf_table.nr;
-	if (!nbufs)
-		goto out_free;
-	ret = -EINVAL;
-	if (!arg->nr)
-		arg->nr = nbufs;
-	else if (arg->nr > nbufs)
-		goto out_free;
-	ret = -EOVERFLOW;
-	if (check_add_overflow(arg->nr, arg->src_off, &off))
-		goto out_free;
-	if (off > nbufs)
-		goto out_free;

 	off = arg->dst_off;
 	i = arg->src_off;
···
 		} else {
 			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 			if (!dst_node) {
-				ret = -ENOMEM;
-				goto out_free;
+				io_rsrc_data_free(ctx, &data);
+				return -ENOMEM;
 			}

 			refcount_inc(&src_node->buf->refs);
···
 		}
 		data.nodes[off++] = dst_node;
 		i++;
+	}
+
+	/* Copy original dst nodes from after the cloned range */
+	for (i = nbufs; i < ctx->buf_table.nr; i++) {
+		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
+
+		if (node) {
+			data.nodes[i] = node;
+			node->refs++;
+		}
 	}

 	/*
···
 	WARN_ON_ONCE(ctx->buf_table.nr);
 	ctx->buf_table = data;
 	return 0;
-
-out_free:
-	io_rsrc_data_free(ctx, &data);
-	return ret;
 }

 /*
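
With the src_off range checked up front, alongside the existing dst_off check, the function no longer starts copying destination nodes before it knows the clone range is valid, which is what made the out_free unwind path necessary. check_add_overflow() is a thin wrapper around the compiler's overflow builtins; a minimal sketch of the same validation in isolation (hypothetical struct and limit value, __builtin_add_overflow as available in GCC and Clang):

#include <errno.h>

/* Hypothetical clone request: copy 'nr' buffers from src_off to dst_off. */
struct clone_range {
	unsigned int nr;
	unsigned int src_off;
	unsigned int dst_off;
};

#define MAX_REG_BUFFERS	(1U << 14)	/* assumed registration limit */

/*
 * Reject a clone range whose end overflows or lands outside either table,
 * mirroring the check_add_overflow() checks in the hunk above. nr == 0 is
 * treated as "clone everything", as in the kernel code.
 */
static int validate_clone_range(struct clone_range *arg, unsigned int src_nbufs)
{
	unsigned int end;

	if (!src_nbufs)
		return -ENXIO;
	if (!arg->nr)
		arg->nr = src_nbufs;
	else if (arg->nr > src_nbufs || arg->nr > MAX_REG_BUFFERS)
		return -EINVAL;
	if (__builtin_add_overflow(arg->nr, arg->src_off, &end) || end > src_nbufs)
		return -EOVERFLOW;
	if (__builtin_add_overflow(arg->nr, arg->dst_off, &end) || end > MAX_REG_BUFFERS)
		return -EOVERFLOW;
	return 0;
}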