Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring/register: add IORING_REGISTER_RESIZE_RINGS

Once a ring has been created, the sizes of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.

For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.

Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to set up the ring initially, and resizes the rings
according to the sizes given.

Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flags associated with how the CQ ring is sized and clamped.

Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation fails
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.

To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as a fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+236
+7
include/linux/io_uring_types.h
··· 416 416 unsigned evfd_last_cq_tail; 417 417 418 418 /* 419 + * Protection for resize vs mmap races - both the mmap and resize 420 + * side will need to grab this lock, to prevent either side from 421 + * being run concurrently with the other. 422 + */ 423 + struct mutex resize_lock; 424 + 425 + /* 419 426 * If IORING_SETUP_NO_MMAP is used, then the below holds 420 427 * the gup'ed pages for the two rings, and the sqes. 421 428 */
+5
include/uapi/linux/io_uring.h
··· 615 615 /* send MSG_RING without having a ring */ 616 616 IORING_REGISTER_SEND_MSG_RING = 31, 617 617 618 + /* 32 reserved for zc rx */ 619 + 620 + /* resize CQ ring */ 621 + IORING_REGISTER_RESIZE_RINGS = 33, 622 + 618 623 /* this goes last */ 619 624 IORING_REGISTER_LAST, 620 625
+1
io_uring/io_uring.c
··· 353 353 INIT_WQ_LIST(&ctx->submit_state.compl_reqs); 354 354 INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); 355 355 io_napi_init(ctx); 356 + mutex_init(&ctx->resize_lock); 356 357 357 358 return ctx; 358 359
+8
io_uring/memmap.c
··· 251 251 unsigned int npages; 252 252 void *ptr; 253 253 254 + guard(mutex)(&ctx->resize_lock); 255 + 254 256 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 255 257 if (IS_ERR(ptr)) 256 258 return PTR_ERR(ptr); ··· 276 274 unsigned long len, unsigned long pgoff, 277 275 unsigned long flags) 278 276 { 277 + struct io_ring_ctx *ctx = filp->private_data; 279 278 void *ptr; 280 279 281 280 /* ··· 286 283 */ 287 284 if (addr) 288 285 return -EINVAL; 286 + 287 + guard(mutex)(&ctx->resize_lock); 289 288 290 289 ptr = io_uring_validate_mmap_request(filp, pgoff, len); 291 290 if (IS_ERR(ptr)) ··· 334 329 unsigned long len, unsigned long pgoff, 335 330 unsigned long flags) 336 331 { 332 + struct io_ring_ctx *ctx = file->private_data; 337 333 void *ptr; 334 + 335 + guard(mutex)(&ctx->resize_lock); 338 336 339 337 ptr = io_uring_validate_mmap_request(file, pgoff, len); 340 338 if (IS_ERR(ptr))
+215
io_uring/register.c
··· 29 29 #include "napi.h" 30 30 #include "eventfd.h" 31 31 #include "msg_ring.h" 32 + #include "memmap.h" 32 33 33 34 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 34 35 IORING_REGISTER_LAST + IORING_OP_LAST) ··· 362 361 return 0; 363 362 } 364 363 364 + /* 365 + * State to maintain until we can swap. Both new and old state, used for 366 + * either mapping or freeing. 367 + */ 368 + struct io_ring_ctx_rings { 369 + unsigned short n_ring_pages; 370 + unsigned short n_sqe_pages; 371 + struct page **ring_pages; 372 + struct page **sqe_pages; 373 + struct io_uring_sqe *sq_sqes; 374 + struct io_rings *rings; 375 + }; 376 + 377 + static void io_register_free_rings(struct io_uring_params *p, 378 + struct io_ring_ctx_rings *r) 379 + { 380 + if (!(p->flags & IORING_SETUP_NO_MMAP)) { 381 + io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages, 382 + true); 383 + io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages, 384 + true); 385 + } else { 386 + io_pages_free(&r->ring_pages, r->n_ring_pages); 387 + io_pages_free(&r->sqe_pages, r->n_sqe_pages); 388 + vunmap(r->rings); 389 + vunmap(r->sq_sqes); 390 + } 391 + } 392 + 393 + #define swap_old(ctx, o, n, field) \ 394 + do { \ 395 + (o).field = (ctx)->field; \ 396 + (ctx)->field = (n).field; \ 397 + } while (0) 398 + 399 + #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) 400 + #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ 401 + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) 402 + 403 + static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) 404 + { 405 + struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; 406 + size_t size, sq_array_offset; 407 + struct io_uring_params p; 408 + unsigned i, tail; 409 + void *ptr; 410 + int ret; 411 + 412 + /* for single issuer, must be owner resizing */ 413 + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && 414 + current != ctx->submitter_task) 415 + return -EEXIST; 416 + if (copy_from_user(&p, arg, 
sizeof(p))) 417 + return -EFAULT; 418 + if (p.flags & ~RESIZE_FLAGS) 419 + return -EINVAL; 420 + 421 + /* properties that are always inherited */ 422 + p.flags |= (ctx->flags & COPY_FLAGS); 423 + 424 + ret = io_uring_fill_params(p.sq_entries, &p); 425 + if (unlikely(ret)) 426 + return ret; 427 + 428 + /* nothing to do, but copy params back */ 429 + if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) { 430 + if (copy_to_user(arg, &p, sizeof(p))) 431 + return -EFAULT; 432 + return 0; 433 + } 434 + 435 + size = rings_size(p.flags, p.sq_entries, p.cq_entries, 436 + &sq_array_offset); 437 + if (size == SIZE_MAX) 438 + return -EOVERFLOW; 439 + 440 + if (!(p.flags & IORING_SETUP_NO_MMAP)) 441 + n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size); 442 + else 443 + n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages, 444 + p.cq_off.user_addr, size); 445 + if (IS_ERR(n.rings)) 446 + return PTR_ERR(n.rings); 447 + 448 + n.rings->sq_ring_mask = p.sq_entries - 1; 449 + n.rings->cq_ring_mask = p.cq_entries - 1; 450 + n.rings->sq_ring_entries = p.sq_entries; 451 + n.rings->cq_ring_entries = p.cq_entries; 452 + 453 + if (copy_to_user(arg, &p, sizeof(p))) { 454 + io_register_free_rings(&p, &n); 455 + return -EFAULT; 456 + } 457 + 458 + if (p.flags & IORING_SETUP_SQE128) 459 + size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries); 460 + else 461 + size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); 462 + if (size == SIZE_MAX) { 463 + io_register_free_rings(&p, &n); 464 + return -EOVERFLOW; 465 + } 466 + 467 + if (!(p.flags & IORING_SETUP_NO_MMAP)) 468 + ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size); 469 + else 470 + ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages, 471 + p.sq_off.user_addr, 472 + size); 473 + if (IS_ERR(ptr)) { 474 + io_register_free_rings(&p, &n); 475 + return PTR_ERR(ptr); 476 + } 477 + 478 + /* 479 + * If using SQPOLL, park the thread 480 + */ 481 + if (ctx->sq_data) { 482 + 
mutex_unlock(&ctx->uring_lock); 483 + io_sq_thread_park(ctx->sq_data); 484 + mutex_lock(&ctx->uring_lock); 485 + } 486 + 487 + /* 488 + * We'll do the swap. Grab the ctx->resize_lock, which will exclude 489 + * any new mmap's on the ring fd. Clear out existing mappings to prevent 490 + * mmap from seeing them, as we'll unmap them. Any attempt to mmap 491 + * existing rings beyond this point will fail. Not that it could proceed 492 + * at this point anyway, as the io_uring mmap side needs go grab the 493 + * ctx->resize_lock as well. Likewise, hold the completion lock over the 494 + * duration of the actual swap. 495 + */ 496 + mutex_lock(&ctx->resize_lock); 497 + spin_lock(&ctx->completion_lock); 498 + o.rings = ctx->rings; 499 + ctx->rings = NULL; 500 + o.sq_sqes = ctx->sq_sqes; 501 + ctx->sq_sqes = NULL; 502 + 503 + /* 504 + * Now copy SQ and CQ entries, if any. If either of the destination 505 + * rings can't hold what is already there, then fail the operation. 506 + */ 507 + n.sq_sqes = ptr; 508 + tail = o.rings->sq.tail; 509 + if (tail - o.rings->sq.head > p.sq_entries) 510 + goto overflow; 511 + for (i = o.rings->sq.head; i < tail; i++) { 512 + unsigned src_head = i & (ctx->sq_entries - 1); 513 + unsigned dst_head = i & n.rings->sq_ring_mask; 514 + 515 + n.sq_sqes[dst_head] = o.sq_sqes[src_head]; 516 + } 517 + n.rings->sq.head = o.rings->sq.head; 518 + n.rings->sq.tail = o.rings->sq.tail; 519 + 520 + tail = o.rings->cq.tail; 521 + if (tail - o.rings->cq.head > p.cq_entries) { 522 + overflow: 523 + /* restore old rings, and return -EOVERFLOW via cleanup path */ 524 + ctx->rings = o.rings; 525 + ctx->sq_sqes = o.sq_sqes; 526 + to_free = &n; 527 + ret = -EOVERFLOW; 528 + goto out; 529 + } 530 + for (i = o.rings->cq.head; i < tail; i++) { 531 + unsigned src_head = i & (ctx->cq_entries - 1); 532 + unsigned dst_head = i & n.rings->cq_ring_mask; 533 + 534 + n.rings->cqes[dst_head] = o.rings->cqes[src_head]; 535 + } 536 + n.rings->cq.head = o.rings->cq.head; 537 + 
n.rings->cq.tail = o.rings->cq.tail; 538 + /* invalidate cached cqe refill */ 539 + ctx->cqe_cached = ctx->cqe_sentinel = NULL; 540 + 541 + n.rings->sq_dropped = o.rings->sq_dropped; 542 + n.rings->sq_flags = o.rings->sq_flags; 543 + n.rings->cq_flags = o.rings->cq_flags; 544 + n.rings->cq_overflow = o.rings->cq_overflow; 545 + 546 + /* all done, store old pointers and assign new ones */ 547 + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 548 + ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); 549 + 550 + ctx->sq_entries = p.sq_entries; 551 + ctx->cq_entries = p.cq_entries; 552 + 553 + ctx->rings = n.rings; 554 + ctx->sq_sqes = n.sq_sqes; 555 + swap_old(ctx, o, n, n_ring_pages); 556 + swap_old(ctx, o, n, n_sqe_pages); 557 + swap_old(ctx, o, n, ring_pages); 558 + swap_old(ctx, o, n, sqe_pages); 559 + to_free = &o; 560 + ret = 0; 561 + out: 562 + spin_unlock(&ctx->completion_lock); 563 + mutex_unlock(&ctx->resize_lock); 564 + io_register_free_rings(&p, to_free); 565 + 566 + if (ctx->sq_data) 567 + io_sq_thread_unpark(ctx->sq_data); 568 + 569 + return ret; 570 + } 571 + 365 572 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, 366 573 void __user *arg, unsigned nr_args) 367 574 __releases(ctx->uring_lock) ··· 757 548 if (!arg || nr_args != 1) 758 549 break; 759 550 ret = io_register_clone_buffers(ctx, arg); 551 + break; 552 + case IORING_REGISTER_RESIZE_RINGS: 553 + ret = -EINVAL; 554 + if (!arg || nr_args != 1) 555 + break; 556 + ret = io_register_resize_rings(ctx, arg); 760 557 break; 761 558 default: 762 559 ret = -EINVAL;