Merge tag 'io_uring-6.6-2023-09-08' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:
"A few fixes that should go into the 6.6-rc merge window:

- Fix for a regression this merge window caused by the SQPOLL
affinity patch, where we can race with SQPOLL thread shutdown and
cause an oops when trying to set affinity (Gabriel)

- Fix for a regression this merge window where reading fdinfo for a
ring set up with IORING_SETUP_NO_SQARRAY will attempt to
dereference the non-existent SQ ring array (me)

- Add the patch that allows more fine-grained control over who can use
io_uring (Matteo)

- Locking fix for a regression added this merge window for IOPOLL
overflow (Pavel)

- IOPOLL fix for stable, breaking our loop if helper threads are
exiting (Pavel)

Also had a fix for unreaped iopoll requests from io-wq from Ming, but
we found an issue with that and hence it got reverted. Will get this
sorted for a future rc"

* tag 'io_uring-6.6-2023-09-08' of git://git.kernel.dk/linux:
Revert "io_uring: fix IO hang in io_wq_put_and_exit from do_exit()"
io_uring: fix unprotected iopoll overflow
io_uring: break out of iowq iopoll on teardown
io_uring: add a sysctl to disable io_uring system-wide
io_uring/fdinfo: only print ->sq_array[] if it's there
io_uring: fix IO hang in io_wq_put_and_exit from do_exit()
io_uring: Don't set affinity on a dying sqpoll thread

Linus Torvalds 2 years ago 7ccc3ebf 32bf43e4

+99 -3

6 changed files

expand all

Documentation

admin-guide

sysctl

kernel.rst

io_uring

fdinfo.c

io-wq.c

io-wq.h

io_uring.c

sqpoll.c

+29

Documentation/admin-guide/sysctl/kernel.rst

··· 450 450 ``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded. 451 451 452 452 453 + io_uring_disabled 454 + ================= 455 + 456 + Prevents all processes from creating new io_uring instances. Enabling this 457 + shrinks the kernel's attack surface. 458 + 459 + = ====================================================================== 460 + 0 All processes can create io_uring instances as normal. This is the 461 + default setting. 462 + 1 io_uring creation is disabled (io_uring_setup() will fail with 463 + -EPERM) for unprivileged processes not in the io_uring_group group. 464 + Existing io_uring instances can still be used. See the 465 + documentation for io_uring_group for more information. 466 + 2 io_uring creation is disabled for all processes. io_uring_setup() 467 + always fails with -EPERM. Existing io_uring instances can still be 468 + used. 469 + = ====================================================================== 470 + 471 + 472 + io_uring_group 473 + ============== 474 + 475 + When io_uring_disabled is set to 1, a process must either be 476 + privileged (CAP_SYS_ADMIN) or be in the io_uring_group group in order 477 + to create an io_uring instance. If io_uring_group is set to -1 (the 478 + default), only processes with the CAP_SYS_ADMIN capability may create 479 + io_uring instances. 480 + 481 + 453 482 kexec_load_disabled 454 483 =================== 455 484

io_uring/fdinfo.c

··· 93 93 struct io_uring_sqe *sqe; 94 94 unsigned int sq_idx; 95 95 96 + if (ctx->flags & IORING_SETUP_NO_SQARRAY) 97 + break; 96 98 sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); 97 99 if (sq_idx > sq_mask) 98 100 continue;

+10

io_uring/io-wq.c

··· 174 174 complete(&wq->worker_done); 175 175 } 176 176 177 + bool io_wq_worker_stopped(void) 178 + { 179 + struct io_worker *worker = current->worker_private; 180 + 181 + if (WARN_ON_ONCE(!io_wq_current_is_worker())) 182 + return true; 183 + 184 + return test_bit(IO_WQ_BIT_EXIT, &worker->wq->state); 185 + } 186 + 177 187 static void io_worker_cancel_cb(struct io_worker *worker) 178 188 { 179 189 struct io_wq_acct *acct = io_wq_get_acct(worker);

io_uring/io-wq.h

··· 52 52 53 53 int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask); 54 54 int io_wq_max_workers(struct io_wq *wq, int *new_count); 55 + bool io_wq_worker_stopped(void); 55 56 56 57 static inline bool io_wq_is_hashed(struct io_wq_work *work) 57 58 {

+54 -2

io_uring/io_uring.c

··· 150 150 151 151 struct kmem_cache *req_cachep; 152 152 153 + static int __read_mostly sysctl_io_uring_disabled; 154 + static int __read_mostly sysctl_io_uring_group = -1; 155 + 156 + #ifdef CONFIG_SYSCTL 157 + static struct ctl_table kernel_io_uring_disabled_table[] = { 158 + { 159 + .procname = "io_uring_disabled", 160 + .data = &sysctl_io_uring_disabled, 161 + .maxlen = sizeof(sysctl_io_uring_disabled), 162 + .mode = 0644, 163 + .proc_handler = proc_dointvec_minmax, 164 + .extra1 = SYSCTL_ZERO, 165 + .extra2 = SYSCTL_TWO, 166 + }, 167 + { 168 + .procname = "io_uring_group", 169 + .data = &sysctl_io_uring_group, 170 + .maxlen = sizeof(gid_t), 171 + .mode = 0644, 172 + .proc_handler = proc_dointvec, 173 + }, 174 + {}, 175 + }; 176 + #endif 177 + 153 178 struct sock *io_uring_get_socket(struct file *file) 154 179 { 155 180 #if defined(CONFIG_UNIX) ··· 908 883 struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; 909 884 910 885 if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { 911 - if (ctx->task_complete) { 886 + if (ctx->lockless_cq) { 912 887 spin_lock(&ctx->completion_lock); 913 888 io_cqring_event_overflow(ctx, cqe->user_data, 914 889 cqe->res, cqe->flags, 0, 0); ··· 1566 1541 1567 1542 if (!(req->flags & REQ_F_CQE_SKIP) && 1568 1543 unlikely(!io_fill_cqe_req(ctx, req))) { 1569 - if (ctx->task_complete) { 1544 + if (ctx->lockless_cq) { 1570 1545 spin_lock(&ctx->completion_lock); 1571 1546 io_req_cqe_overflow(req); 1572 1547 spin_unlock(&ctx->completion_lock); ··· 1974 1949 */ 1975 1950 if (!needs_poll) { 1976 1951 if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) 1952 + break; 1953 + if (io_wq_worker_stopped()) 1977 1954 break; 1978 1955 cond_resched(); 1979 1956 continue; ··· 4065 4038 return io_uring_create(entries, &p, params); 4066 4039 } 4067 4040 4041 + static inline bool io_uring_allowed(void) 4042 + { 4043 + int disabled = READ_ONCE(sysctl_io_uring_disabled); 4044 + kgid_t io_uring_group; 4045 + 4046 + if (disabled == 2) 4047 + return 
false; 4048 + 4049 + if (disabled == 0 || capable(CAP_SYS_ADMIN)) 4050 + return true; 4051 + 4052 + io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group); 4053 + if (!gid_valid(io_uring_group)) 4054 + return false; 4055 + 4056 + return in_group_p(io_uring_group); 4057 + } 4058 + 4068 4059 SYSCALL_DEFINE2(io_uring_setup, u32, entries, 4069 4060 struct io_uring_params __user *, params) 4070 4061 { 4062 + if (!io_uring_allowed()) 4063 + return -EPERM; 4064 + 4071 4065 return io_uring_setup(entries, params); 4072 4066 } 4073 4067 ··· 4681 4633 SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, 4682 4634 offsetof(struct io_kiocb, cmd.data), 4683 4635 sizeof_field(struct io_kiocb, cmd.data), NULL); 4636 + 4637 + #ifdef CONFIG_SYSCTL 4638 + register_sysctl_init("kernel", kernel_io_uring_disabled_table); 4639 + #endif 4684 4640 4685 4641 return 0; 4686 4642 };

+3 -1

io_uring/sqpoll.c

··· 430 430 431 431 if (sqd) { 432 432 io_sq_thread_park(sqd); 433 - ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); 433 + /* Don't set affinity for a dying thread */ 434 + if (sqd->thread) 435 + ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); 434 436 io_sq_thread_unpark(sqd); 435 437 } 436 438

Configure Feed

Configure Feed