Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'io_uring-6.3-2023-03-03' of git://git.kernel.dk/linux

Pull more io_uring updates from Jens Axboe:
"Here's a set of fixes/changes that didn't make the first cut, either
because they got queued before I sent the early merge request, or
fixes that came in afterwards. In detail:

- Don't set MSG_NOSIGNAL on recv/recvmsg opcodes, as AF_PACKET will
error out (David)

- Fix for spurious poll wakeups (me)

- Fix for a file leak for buffered reads in certain conditions
(Joseph)

- Don't allow registered buffers of mixed types (Pavel)

- Improve handling of huge pages for registered buffers (Pavel)

- Provided buffer ring size calculation fix (Wojciech)

- Minor cleanups (me)"

* tag 'io_uring-6.3-2023-03-03' of git://git.kernel.dk/linux:
io_uring/poll: don't pass in wake func to io_init_poll_iocb()
io_uring: fix fget leak when fs don't support nowait buffered read
io_uring/poll: allow some retries for poll triggering spuriously
io_uring: remove MSG_NOSIGNAL from recvmsg
io_uring/rsrc: always initialize 'folio' to NULL
io_uring/rsrc: optimise registered huge pages
io_uring/rsrc: optimise single entry advance
io_uring/rsrc: disallow multi-source reg buffers
io_uring: remove unused wq_list_merge
io_uring: fix size calculation when registering buf ring
io_uring/rsrc: fix a comment in io_import_fixed()
io_uring: rename 'in_idle' to 'in_cancel'
io_uring: consolidate the put_ref-and-return section of adding work

+85 -62
+1 -1
include/linux/io_uring_types.h
··· 58 58 59 59 struct xarray xa; 60 60 struct wait_queue_head wait; 61 - atomic_t in_idle; 61 + atomic_t in_cancel; 62 62 atomic_t inflight_tracked; 63 63 struct percpu_counter inflight; 64 64
+16 -16
io_uring/io_uring.c
··· 719 719 struct io_uring_task *tctx = task->io_uring; 720 720 721 721 percpu_counter_sub(&tctx->inflight, nr); 722 - if (unlikely(atomic_read(&tctx->in_idle))) 722 + if (unlikely(atomic_read(&tctx->in_cancel))) 723 723 wake_up(&tctx->wait); 724 724 put_task_struct_many(task, nr); 725 725 } ··· 1258 1258 1259 1259 ctx_flush_and_put(ctx, &uring_locked); 1260 1260 1261 - /* relaxed read is enough as only the task itself sets ->in_idle */ 1262 - if (unlikely(atomic_read(&tctx->in_idle))) 1261 + /* relaxed read is enough as only the task itself sets ->in_cancel */ 1262 + if (unlikely(atomic_read(&tctx->in_cancel))) 1263 1263 io_uring_drop_tctx_refs(current); 1264 1264 1265 1265 trace_io_uring_task_work_run(tctx, count, loops); ··· 1285 1285 1286 1286 percpu_ref_get(&ctx->refs); 1287 1287 1288 - if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) { 1289 - percpu_ref_put(&ctx->refs); 1290 - return; 1291 - } 1288 + if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) 1289 + goto put_ref; 1290 + 1292 1291 /* needed for the following wake up */ 1293 1292 smp_mb__after_atomic(); 1294 1293 1295 - if (unlikely(atomic_read(&req->task->io_uring->in_idle))) { 1294 + if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { 1296 1295 io_move_task_work_from_local(ctx); 1297 - percpu_ref_put(&ctx->refs); 1298 - return; 1296 + goto put_ref; 1299 1297 } 1300 1298 1301 1299 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) ··· 1303 1305 1304 1306 if (READ_ONCE(ctx->cq_waiting)) 1305 1307 wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); 1308 + 1309 + put_ref: 1306 1310 percpu_ref_put(&ctx->refs); 1307 1311 } 1308 1312 ··· 1777 1777 const struct io_issue_def *def = &io_issue_defs[req->opcode]; 1778 1778 1779 1779 /* assign early for deferred execution for non-fixed file */ 1780 - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) 1780 + if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file) 1781 1781 req->file = io_file_get_normal(req, req->cqe.fd); 1782 1782 if (!cdef->prep_async) 1783 1783 return 0; ··· 2937 2937 2938 2938 work = container_of(cb, struct io_tctx_exit, task_work); 2939 2939 /* 2940 - * When @in_idle, we're in cancellation and it's racy to remove the 2940 + * When @in_cancel, we're in cancellation and it's racy to remove the 2941 2941 * node. It'll be removed by the end of cancellation, just ignore it. 2942 2942 * tctx can be NULL if the queueing of this task_work raced with 2943 2943 * work cancelation off the exec path. 2944 2944 */ 2945 - if (tctx && !atomic_read(&tctx->in_idle)) 2945 + if (tctx && !atomic_read(&tctx->in_cancel)) 2946 2946 io_uring_del_tctx_node((unsigned long)work->ctx); 2947 2947 complete(&work->completion); 2948 2948 } ··· 3210 3210 if (tctx->io_wq) 3211 3211 io_wq_exit_start(tctx->io_wq); 3212 3212 3213 - atomic_inc(&tctx->in_idle); 3213 + atomic_inc(&tctx->in_cancel); 3214 3214 do { 3215 3215 bool loop = false; 3216 3216 ··· 3261 3261 if (cancel_all) { 3262 3262 /* 3263 3263 * We shouldn't run task_works after cancel, so just leave 3264 - * ->in_idle set for normal exit. 3264 + * ->in_cancel set for normal exit. 3265 3265 */ 3266 - atomic_dec(&tctx->in_idle); 3266 + atomic_dec(&tctx->in_cancel); 3267 3267 /* for exec all current's requests should be gone, kill tctx */ 3268 3268 __io_uring_free(current); 3269 3269 }
+1 -1
io_uring/kbuf.c
··· 505 505 } 506 506 507 507 pages = io_pin_pages(reg.ring_addr, 508 - struct_size(br, bufs, reg.ring_entries), 508 + flex_array_size(br, bufs, reg.ring_entries), 509 509 &nr_pages); 510 510 if (IS_ERR(pages)) { 511 511 kfree(free_bl);
+1 -1
io_uring/net.c
··· 567 567 sr->flags = READ_ONCE(sqe->ioprio); 568 568 if (sr->flags & ~(RECVMSG_FLAGS)) 569 569 return -EINVAL; 570 - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 570 + sr->msg_flags = READ_ONCE(sqe->msg_flags); 571 571 if (sr->msg_flags & MSG_DONTWAIT) 572 572 req->flags |= REQ_F_NOWAIT; 573 573 if (sr->msg_flags & MSG_ERRQUEUE)
+19 -7
io_uring/poll.c
··· 51 51 52 52 #define IO_WQE_F_DOUBLE 1 53 53 54 + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 55 + void *key); 56 + 54 57 static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe) 55 58 { 56 59 unsigned long priv = (unsigned long)wqe->private; ··· 167 164 } 168 165 } 169 166 170 - static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, 171 - wait_queue_func_t wake_func) 167 + static void io_init_poll_iocb(struct io_poll *poll, __poll_t events) 172 168 { 173 169 poll->head = NULL; 174 170 #define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) 175 171 /* mask in events that we always want/need */ 176 172 poll->events = events | IO_POLL_UNMASK; 177 173 INIT_LIST_HEAD(&poll->wait.entry); 178 - init_waitqueue_func_entry(&poll->wait, wake_func); 174 + init_waitqueue_func_entry(&poll->wait, io_poll_wake); 179 175 } 180 176 181 177 static inline void io_poll_remove_entry(struct io_poll *poll) ··· 510 508 511 509 /* mark as double wq entry */ 512 510 wqe_private |= IO_WQE_F_DOUBLE; 513 - io_init_poll_iocb(poll, first->events, first->wait.func); 511 + io_init_poll_iocb(poll, first->events); 514 512 if (!io_poll_double_prepare(req)) { 515 513 /* the request is completing, just back off */ 516 514 kfree(poll); ··· 571 569 572 570 INIT_HLIST_NODE(&req->hash_node); 573 571 req->work.cancel_seq = atomic_read(&ctx->cancel_seq); 574 - io_init_poll_iocb(poll, mask, io_poll_wake); 572 + io_init_poll_iocb(poll, mask); 575 573 poll->file = req->file; 576 574 req->apoll_events = poll->events; 577 575 ··· 652 650 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); 653 651 } 654 652 653 + /* 654 + * We can't reliably detect loops in repeated poll triggers and issue 655 + * subsequently failing. But rather than fail these immediately, allow a 656 + * certain amount of retries before we give up. Given that this condition 657 + * should _rarely_ trigger even once, we should be fine with a larger value. 658 + */ 659 + #define APOLL_MAX_RETRY 128 660 + 655 661 static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, 656 662 unsigned issue_flags) 657 663 { ··· 675 665 if (entry == NULL) 676 666 goto alloc_apoll; 677 667 apoll = container_of(entry, struct async_poll, cache); 668 + apoll->poll.retries = APOLL_MAX_RETRY; 678 669 } else { 679 670 alloc_apoll: 680 671 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); 681 672 if (unlikely(!apoll)) 682 673 return NULL; 674 + apoll->poll.retries = APOLL_MAX_RETRY; 683 675 } 684 676 apoll->double_poll = NULL; 685 677 req->apoll = apoll; 678 + if (unlikely(!--apoll->poll.retries)) 679 + return NULL; 686 680 return apoll; 687 681 } 688 682 ··· 707 693 if (!def->pollin && !def->pollout) 708 694 return IO_APOLL_ABORTED; 709 695 if (!file_can_poll(req->file)) 710 - return IO_APOLL_ABORTED; 711 - if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) 712 696 return IO_APOLL_ABORTED; 713 697 if (!(req->flags & REQ_F_APOLL_MULTISHOT)) 714 698 mask |= EPOLLONESHOT;
+1
io_uring/poll.h
··· 12 12 struct file *file; 13 13 struct wait_queue_head *head; 14 14 __poll_t events; 15 + int retries; 15 16 struct wait_queue_entry wait; 16 17 }; 17 18
+45 -13
io_uring/rsrc.c
··· 1162 1162 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, 1163 1163 pages, vmas); 1164 1164 if (pret == nr_pages) { 1165 + struct file *file = vmas[0]->vm_file; 1166 + 1165 1167 /* don't support file backed memory */ 1166 1168 for (i = 0; i < nr_pages; i++) { 1167 - struct vm_area_struct *vma = vmas[i]; 1168 - 1169 - if (vma_is_shmem(vma)) 1169 + if (vmas[i]->vm_file != file) { 1170 + ret = -EINVAL; 1171 + break; 1172 + } 1173 + if (!file) 1170 1174 continue; 1171 - if (vma->vm_file && 1172 - !is_file_hugepages(vma->vm_file)) { 1175 + if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) { 1173 1176 ret = -EOPNOTSUPP; 1174 1177 break; 1175 1178 } ··· 1210 1207 unsigned long off; 1211 1208 size_t size; 1212 1209 int ret, nr_pages, i; 1210 + struct folio *folio = NULL; 1213 1211 1214 1212 *pimu = ctx->dummy_ubuf; 1215 1213 if (!iov->iov_base) ··· 1225 1221 goto done; 1226 1222 } 1227 1223 1224 + /* If it's a huge page, try to coalesce them into a single bvec entry */ 1225 + if (nr_pages > 1) { 1226 + folio = page_folio(pages[0]); 1227 + for (i = 1; i < nr_pages; i++) { 1228 + if (page_folio(pages[i]) != folio) { 1229 + folio = NULL; 1230 + break; 1231 + } 1232 + } 1233 + if (folio) { 1234 + folio_put_refs(folio, nr_pages - 1); 1235 + nr_pages = 1; 1236 + } 1237 + } 1238 + 1228 1239 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 1229 1240 if (!imu) 1230 1241 goto done; ··· 1252 1233 1253 1234 off = (unsigned long) iov->iov_base & ~PAGE_MASK; 1254 1235 size = iov->iov_len; 1236 + /* store original address for later verification */ 1237 + imu->ubuf = (unsigned long) iov->iov_base; 1238 + imu->ubuf_end = imu->ubuf + iov->iov_len; 1239 + imu->nr_bvecs = nr_pages; 1240 + *pimu = imu; 1241 + ret = 0; 1242 + 1243 + if (folio) { 1244 + bvec_set_page(&imu->bvec[0], pages[0], size, off); 1245 + goto done; 1246 + } 1255 1247 for (i = 0; i < nr_pages; i++) { 1256 1248 size_t vec_len; 1257 1249 ··· 1271 1241 off = 0; 1272 1242 size -= vec_len; 1273 1243 } 1274 - /* store original address for later verification */ 1275 - imu->ubuf = (unsigned long) iov->iov_base; 1276 - imu->ubuf_end = imu->ubuf + iov->iov_len; 1277 - imu->nr_bvecs = nr_pages; 1278 - *pimu = imu; 1279 - ret = 0; 1280 1244 done: 1281 1245 if (ret) 1282 1246 kvfree(imu); ··· 1359 1335 return -EFAULT; 1360 1336 1361 1337 /* 1362 - * May not be a start of buffer, set size appropriately 1338 + * Might not be a start of buffer, set size appropriately 1363 1339 * and advance us to the beginning. 1364 1340 */ 1365 1341 offset = buf_addr - imu->ubuf; ··· 1385 1361 const struct bio_vec *bvec = imu->bvec; 1386 1362 1387 1363 if (offset <= bvec->bv_len) { 1388 - iov_iter_advance(iter, offset); 1364 + /* 1365 + * Note, huge pages buffers consists of one large 1366 + * bvec entry and should always go this way. The other 1367 + * branch doesn't expect non PAGE_SIZE'd chunks. 1368 + */ 1369 + iter->bvec = bvec; 1370 + iter->nr_segs = bvec->bv_len; 1371 + iter->count -= offset; 1372 + iter->iov_offset = offset; 1389 1373 } else { 1390 1374 unsigned long seg_skip; 1391 1375
-22
io_uring/slist.h
··· 27 27 list->last = node; 28 28 } 29 29 30 - /** 31 - * wq_list_merge - merge the second list to the first one. 32 - * @list0: the first list 33 - * @list1: the second list 34 - * Return the first node after mergence. 35 - */ 36 - static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, 37 - struct io_wq_work_list *list1) 38 - { 39 - struct io_wq_work_node *ret; 40 - 41 - if (!list0->first) { 42 - ret = list1->first; 43 - } else { 44 - ret = list0->first; 45 - list0->last->next = list1->first; 46 - } 47 - INIT_WQ_LIST(list0); 48 - INIT_WQ_LIST(list1); 49 - return ret; 50 - } 51 - 52 30 static inline void wq_list_add_tail(struct io_wq_work_node *node, 53 31 struct io_wq_work_list *list) 54 32 {
+1 -1
io_uring/tctx.c
··· 83 83 84 84 xa_init(&tctx->xa); 85 85 init_waitqueue_head(&tctx->wait); 86 - atomic_set(&tctx->in_idle, 0); 86 + atomic_set(&tctx->in_cancel, 0); 87 87 atomic_set(&tctx->inflight_tracked, 0); 88 88 task->io_uring = tctx; 89 89 init_llist_head(&tctx->task_list);