Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'io_uring-7.1-20260424' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring fixes from Jens Axboe:

- Fix for a NOMMU bug with io_uring, where NOMMU doesn't grab page refs
at mmap time. NOMMU also has entirely broken FOLL_PIN support, yet
here we are

- A few fixes covering minor issues introduced in this merge window

- Data race annotation to shut up KCSAN when io-wq limits are applied

- A nospec addition for direct descriptor file updating. Rest of the
direct descriptor path already had this, but for some reason the
update did not. Now they are all the same

- Various minor defensive changes that Claude identified and suggested
terrible fixes for, turned into actually useful cleanups:

  - Use kvfree() for the imu cache. These can come from kmalloc or
    vmalloc depending on size, but the in-cache ones are capped at a
    size where it's always kmalloc based. Change to kvfree() in the
    cleanup path, making future changes unlikely to mess that up

  - Negative kbuf consumption lengths. Can't happen right now, but
    cqe->res is used directly, and if other code changes it could
    then be an error value

- Fix for an issue with the futex code, where partial wakes on a
vectored futex wait would potentially wake the same futex twice, rather
than move on to the next one. This could confuse an application as it
would've expected the next futex to have been woken

- Fix for a bug with ring resizing, where SQEs or CQEs might not have
been copied correctly if large SQEs or CQEs are used in the ring. An
application-visible issue, as entries could have been lost during
resize

- Fix for a bug where EPOLL_URING_WAKE might have been lost, causing a
nested multishot poll to not be terminated like it should have been

- Fix for an issue with signed comparison of poll references for the
slow path

- Fix for a user_struct UAF in the zcrx code

- Two minor zcrx cleanups

* tag 'io_uring-7.1-20260424' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
io_uring: take page references for NOMMU pbuf_ring mmaps
io_uring/poll: ensure EPOLL_ONESHOT is propagated for EPOLL_URING_WAKE
io_uring/zcrx: warn on freelist violations
io_uring/zcrx: clear RQ headers on init
io_uring/zcrx: fix user_struct uaf
io_uring/register: fix ring resizing with mixed/large SQEs/CQEs
io_uring/futex: ensure partial wakes are appropriately dequeued
io_uring/rw: add defensive hardening for negative kbuf lengths
io_uring/rsrc: use kvfree() for the imu cache
io_uring/rsrc: unify nospec indexing for direct descriptors
io_uring: fix spurious fput in registered ring path
io_uring: fix iowq_limits data race in tctx node addition
io_uring/tctx: mark io_wq as exiting before error path teardown
io_uring/tctx: check for setup tctx->io_wq before teardown
io_uring/poll: fix signed comparison in io_poll_get_ownership()

+109 -22
+1 -1
io_uring/alloc_cache.h
···
 static inline void io_cache_free(struct io_alloc_cache *cache, void *obj)
 {
 	if (!io_alloc_cache_put(cache, obj))
-		kfree(obj);
+		kvfree(obj);
 }
 
 #endif
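
For reference, the reason kvfree() is the safe choice here: kvmalloc() falls back from kmalloc() to vmalloc() for larger sizes, and kvfree() releases either backing correctly, while kfree() is only valid for the kmalloc case. A minimal sketch of the generic kernel pattern (illustrative names, not io_uring code):

#include <linux/slab.h>	/* kvmalloc(), kvfree() */

/* Hypothetical helper: the allocation may be kmalloc- or
 * vmalloc-backed depending on size. */
static void *entry_alloc(size_t size)
{
	return kvmalloc(size, GFP_KERNEL);
}

static void entry_free(void *entry)
{
	/* Correct for either backing; kfree() would be a bug for
	 * the vmalloc case. */
	kvfree(entry);
}
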
+3 -1
io_uring/futex.c
···
 	struct io_kiocb *req = q->wake_data;
 	struct io_futexv_data *ifd = req->async_data;
 
-	if (!io_futexv_claim(ifd))
+	if (!io_futexv_claim(ifd)) {
+		__futex_wake_mark(q);
 		return;
+	}
 	if (unlikely(!__futex_wake_mark(q)))
 		return;
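
To illustrate the case this fixes from the application side, here is a hedged sketch of a vectored futex wait, assuming liburing's io_uring_prep_futex_waitv() helper and the struct futex_waitv layout from <linux/futex.h>. With the bug, a wake racing with an already-claimed request could hit the same futex again instead of moving on to the next entry:

#include <liburing.h>
#include <linux/futex.h>
#include <stdint.h>

/* Wait on two futexes at once; on success cqe->res identifies which
 * vector entry woke, mirroring futex_waitv(2) semantics. */
static int wait_pair(struct io_uring *ring, uint32_t *a, uint32_t *b)
{
	struct futex_waitv fw[2] = {
		{ .val = 0, .uaddr = (uintptr_t)a, .flags = FUTEX2_SIZE_U32 },
		{ .val = 0, .uaddr = (uintptr_t)b, .flags = FUTEX2_SIZE_U32 },
	};
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int idx;

	io_uring_prep_futex_waitv(sqe, fw, 2, 0);
	io_uring_submit(ring);
	if (io_uring_wait_cqe(ring, &cqe))
		return -1;
	idx = cqe->res;	/* woken index, or a negative error */
	io_uring_cqe_seen(ring, cqe);
	return idx;
}
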
+2 -1
io_uring/io_uring.c
···
 		return ERR_PTR(-EBADF);
 	if (io_is_uring_fops(file))
 		return file;
-	fput(file);
+	if (!registered)
+		fput(file);
 	return ERR_PTR(-EOPNOTSUPP);
 }
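
The fput fix matters for registered ring fds: when the application enters the kernel with IORING_ENTER_REGISTERED_RING, the file is looked up in the task's registered table rather than via fget(), so the error path must not drop a reference it never took. A hedged userspace sketch using liburing's ring-fd registration wrapper:

#include <liburing.h>

static int submit_via_registered_fd(struct io_uring *ring)
{
	/* After registration, liburing uses the registered index and
	 * sets IORING_ENTER_REGISTERED_RING on io_uring_enter(). */
	int ret = io_uring_register_ring_fd(ring);

	if (ret < 0)
		return ret;
	return io_uring_submit(ring);
}
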
+45 -1
io_uring/memmap.c
···
 
 #else /* !CONFIG_MMU */
 
+/*
+ * Drop the pages that were initially referenced and added in
+ * io_uring_mmap(). We cannot have had a mremap() as that isn't supported,
+ * hence the vma should be identical to the one we initially referenced and
+ * mapped, and partial unmaps and splitting isn't possible on a file backed
+ * mapping.
+ */
+static void io_uring_nommu_vm_close(struct vm_area_struct *vma)
+{
+	unsigned long index;
+
+	for (index = vma->vm_start; index < vma->vm_end; index += PAGE_SIZE)
+		put_page(virt_to_page((void *) index));
+}
+
+static const struct vm_operations_struct io_uring_nommu_vm_ops = {
+	.close = io_uring_nommu_vm_close,
+};
+
 int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
+	struct io_ring_ctx *ctx = file->private_data;
+	struct io_mapped_region *region;
+	unsigned long i;
+
+	if (!is_nommu_shared_mapping(vma->vm_flags))
+		return -EINVAL;
+
+	guard(mutex)(&ctx->mmap_lock);
+	region = io_mmap_get_region(ctx, vma->vm_pgoff);
+	if (!region || !io_region_is_set(region))
+		return -EINVAL;
+
+	if ((vma->vm_end - vma->vm_start) !=
+	    (unsigned long) region->nr_pages << PAGE_SHIFT)
+		return -EINVAL;
+
+	/*
+	 * Pin the pages so io_free_region()'s release_pages() does not
+	 * drop the last reference while this VMA exists. delete_vma()
+	 * in mm/nommu.c calls vma_close() which runs ->close above.
+	 */
+	for (i = 0; i < region->nr_pages; i++)
+		get_page(region->pages[i]);
+
+	vma->vm_ops = &io_uring_nommu_vm_ops;
+	return 0;
 }
 
 unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
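
A side note on the guard(mutex)() call in the new io_uring_mmap(): it is the scope-based lock guard from <linux/cleanup.h>, so every early return above drops ctx->mmap_lock automatically. A generic sketch of the idiom (demo names, not kernel code):

#include <linux/cleanup.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_lock);

static int demo_lookup(int key)
{
	guard(mutex)(&demo_lock);	/* released at every return */

	if (key < 0)
		return -EINVAL;		/* no explicit unlock needed */
	return key;
}
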
+4 -2
io_uring/poll.c
···
  */
 static inline bool io_poll_get_ownership(struct io_kiocb *req)
 {
-	if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
+	if (unlikely((unsigned int)atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
 		return io_poll_get_ownership_slowpath(req);
 	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
 }
···
 	 * disable multishot as there is a circular dependency between
 	 * CQ posting and triggering the event.
 	 */
-	if (mask & EPOLL_URING_WAKE)
+	if (mask & EPOLL_URING_WAKE) {
 		poll->events |= EPOLLONESHOT;
+		req->apoll_events |= EPOLLONESHOT;
+	}
 
 	/* optional, saves extra locking for removal in tw handler */
 	if (mask && poll->events & EPOLLONESHOT) {
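
From the application side, the EPOLLONESHOT propagation matters for multishot poll: the request keeps posting CQEs until the kernel terminates it, which it reports by clearing IORING_CQE_F_MORE in a completion. A hedged liburing sketch of arming a multishot poll and detecting that termination so it can be re-armed:

#include <liburing.h>
#include <poll.h>

static void arm_multishot_poll(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_poll_multishot(sqe, fd, POLLIN);
	io_uring_submit(ring);
}

/* Returns true while the multishot poll is still armed; once
 * IORING_CQE_F_MORE is clear (e.g. downgraded to oneshot by a nested
 * EPOLL_URING_WAKE), the caller must re-arm. */
static bool poll_still_armed(const struct io_uring_cqe *cqe)
{
	return cqe->res >= 0 && (cqe->flags & IORING_CQE_F_MORE);
}
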
+26 -6
io_uring/register.c
···
 	if (tail - old_head > p->sq_entries)
 		goto overflow;
 	for (i = old_head; i < tail; i++) {
-		unsigned src_head = i & (ctx->sq_entries - 1);
-		unsigned dst_head = i & (p->sq_entries - 1);
+		unsigned index, dst_mask, src_mask;
+		size_t sq_size;
 
-		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
+		index = i;
+		sq_size = sizeof(struct io_uring_sqe);
+		src_mask = ctx->sq_entries - 1;
+		dst_mask = p->sq_entries - 1;
+		if (ctx->flags & IORING_SETUP_SQE128) {
+			index <<= 1;
+			sq_size <<= 1;
+			src_mask = (ctx->sq_entries << 1) - 1;
+			dst_mask = (p->sq_entries << 1) - 1;
+		}
+		memcpy(&n.sq_sqes[index & dst_mask], &o.sq_sqes[index & src_mask], sq_size);
 	}
 	WRITE_ONCE(n.rings->sq.head, old_head);
 	WRITE_ONCE(n.rings->sq.tail, tail);
···
 		goto out;
 	}
 	for (i = old_head; i < tail; i++) {
-		unsigned src_head = i & (ctx->cq_entries - 1);
-		unsigned dst_head = i & (p->cq_entries - 1);
+		unsigned index, dst_mask, src_mask;
+		size_t cq_size;
 
-		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+		index = i;
+		cq_size = sizeof(struct io_uring_cqe);
+		src_mask = ctx->cq_entries - 1;
+		dst_mask = p->cq_entries - 1;
+		if (ctx->flags & IORING_SETUP_CQE32) {
+			index <<= 1;
+			cq_size <<= 1;
+			src_mask = (ctx->cq_entries << 1) - 1;
+			dst_mask = (p->cq_entries << 1) - 1;
+		}
+		memcpy(&n.rings->cqes[index & dst_mask], &o.rings->cqes[index & src_mask], cq_size);
 	}
 	WRITE_ONCE(n.rings->cq.head, old_head);
 	WRITE_ONCE(n.rings->cq.tail, tail);
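
For context on the resize fix: with IORING_SETUP_SQE128 each SQE occupies two 64-byte slots (and IORING_SETUP_CQE32 likewise doubles CQE slots), which is why the copy above scales indices, masks, and sizes. A hedged userspace sketch of triggering a resize, assuming liburing's io_uring_resize_rings() wrapper for IORING_REGISTER_RESIZE_RINGS:

#include <liburing.h>
#include <string.h>

static int grow_rings(struct io_uring *ring)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.sq_entries = 256;	/* new sizes; powers of two */
	p.cq_entries = 512;
	/* The kernel copies pending SQEs/CQEs across; before this fix,
	 * rings using large SQEs/CQEs could lose entries here. */
	return io_uring_resize_rings(ring, &p);
}
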
+4 -1
io_uring/rsrc.c
···
 void io_rsrc_cache_free(struct io_ring_ctx *ctx)
 {
 	io_alloc_cache_free(&ctx->node_cache, kfree);
-	io_alloc_cache_free(&ctx->imu_cache, kfree);
+	io_alloc_cache_free(&ctx->imu_cache, kvfree);
 }
 
 static void io_clear_table_tags(struct io_rsrc_data *data)
···
 			continue;
 
 		i = up->offset + done;
+		if (i >= ctx->file_table.data.nr)
+			break;
+		i = array_index_nospec(i, ctx->file_table.data.nr);
 		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
 			io_file_bitmap_clear(&ctx->file_table, i);
+7 -2
io_uring/rsrc.h
···
 }
 
 static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
-				      struct io_rsrc_data *data, int index)
+				      struct io_rsrc_data *data,
+				      unsigned int index)
 {
-	struct io_rsrc_node *node = data->nodes[index];
+	struct io_rsrc_node *node;
 
+	if (index >= data->nr)
+		return false;
+	index = array_index_nospec(index, data->nr);
+	node = data->nodes[index];
 	if (!node)
 		return false;
 	io_put_rsrc_node(ctx, node);
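
Both rsrc changes follow the standard Spectre-v1 hardening recipe from <linux/nospec.h>: bounds-check first, then clamp the index with array_index_nospec() so a speculatively out-of-bounds value cannot be used to index the table. The generic shape of the pattern (illustrative names):

#include <linux/nospec.h>

static struct file *table_lookup(struct file **table, unsigned int nr,
				 unsigned int index)
{
	if (index >= nr)
		return NULL;
	/* Index stays in bounds even under misspeculation */
	index = array_index_nospec(index, nr);
	return table[index];
}
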
+2 -2
io_uring/rw.c
···
 	io_req_io_end(req);
 
 	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
-		req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL);
+		req->cqe.flags |= io_put_kbuf(req, max(req->cqe.res, 0), NULL);
 
 	io_req_rw_cleanup(req, 0);
 	io_req_task_complete(tw_req, tw);
···
 		list_del(&req->iopoll_node);
 		wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs);
 		nr_events++;
-		req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL);
+		req->cqe.flags = io_put_kbuf(req, max(req->cqe.res, 0), NULL);
 		if (!io_is_uring_cmd(req))
 			io_req_rw_cleanup(req, 0);
 	}
+11 -4
io_uring/tctx.c
···
 	if (IS_ERR(tctx))
 		return PTR_ERR(tctx);
 
-	if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) {
-		unsigned int limits[2] = { ctx->iowq_limits[0],
-					   ctx->iowq_limits[1], };
+	if (data_race(ctx->int_flags) & IO_RING_F_IOWQ_LIMITS_SET) {
+		unsigned int limits[2];
+
+		mutex_lock(&ctx->uring_lock);
+		limits[0] = ctx->iowq_limits[0];
+		limits[1] = ctx->iowq_limits[1];
+		mutex_unlock(&ctx->uring_lock);
 
 		ret = io_wq_max_workers(tctx->io_wq, limits);
 		if (ret)
···
 	}
 	if (!current->io_uring) {
 err_free:
-		io_wq_put_and_exit(tctx->io_wq);
+		if (tctx->io_wq) {
+			io_wq_exit_start(tctx->io_wq);
+			io_wq_put_and_exit(tctx->io_wq);
+		}
 		percpu_counter_destroy(&tctx->inflight);
 		kfree(tctx);
 	}
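
The data_race() macro used above (from <linux/compiler.h>) marks a lockless read as intentional so KCSAN stays quiet; the authoritative reads of the limits are then done under ctx->uring_lock. A minimal sketch of the annotation with made-up names:

#include <linux/compiler.h>

/* Hypothetical: *flags is written elsewhere under a lock. */
static bool limits_maybe_set(const unsigned int *flags)
{
	/* A racy read is acceptable here: a stale value only steers us
	 * toward or away from the locked slow path, never to bad data. */
	return data_race(*flags) & 1U;
}
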
+4 -1
io_uring/zcrx.c
···
 	ifq->rq.ring = (struct io_uring *)ptr;
 	ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
 
+	memset(ifq->rq.ring, 0, sizeof(*ifq->rq.ring));
 	return 0;
 }
···
 
 	if (ifq->area)
 		io_zcrx_free_area(ifq, ifq->area);
-	free_uid(ifq->user);
 	if (ifq->mm_account)
 		mmdrop(ifq->mm_account);
 	if (ifq->dev)
 		put_device(ifq->dev);
 
 	io_free_rbuf_ring(ifq);
+	free_uid(ifq->user);
 	mutex_destroy(&ifq->pp_lock);
 	kfree(ifq);
 }
···
 	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
 
 	guard(spinlock_bh)(&area->freelist_lock);
+	if (WARN_ON_ONCE(area->free_count >= area->nia.num_niovs))
+		return;
 	area->freelist[area->free_count++] = net_iov_idx(niov);