io_uring/tctx: add separate lock for list of tctx's in ctx

ctx->tctx_list holds the tasks using this ring, and it's currently
protected by the normal ctx->uring_lock. However, this can cause a
circular locking issue, as reported by syzbot, where cancelations off
exec end up needing to remove an entry from this list:

======================================================
WARNING: possible circular locking dependency detected
syzkaller #0 Tainted: G L
------------------------------------------------------
syz.0.9999/12287 is trying to acquire lock:
ffff88805851c0a8 (&ctx->uring_lock){+.+.}-{4:4}, at: io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179

but task is already holding lock:
ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline]
ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (&sig->cred_guard_mutex){+.+.}-{4:4}:
__mutex_lock_common kernel/locking/mutex.c:614 [inline]
__mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776
proc_pid_attr_write+0x547/0x630 fs/proc/base.c:2837
vfs_write+0x27e/0xb30 fs/read_write.c:684
ksys_write+0x145/0x250 fs/read_write.c:738
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #1 (sb_writers#3){.+.+}-{0:0}:
percpu_down_read_internal include/linux/percpu-rwsem.h:53 [inline]
percpu_down_read_freezable include/linux/percpu-rwsem.h:83 [inline]
__sb_start_write include/linux/fs/super.h:19 [inline]
sb_start_write+0x4d/0x1c0 include/linux/fs/super.h:125
mnt_want_write+0x41/0x90 fs/namespace.c:499
open_last_lookups fs/namei.c:4529 [inline]
path_openat+0xadd/0x3dd0 fs/namei.c:4784
do_filp_open+0x1fa/0x410 fs/namei.c:4814
io_openat2+0x3e0/0x5c0 io_uring/openclose.c:143
__io_issue_sqe+0x181/0x4b0 io_uring/io_uring.c:1792
io_issue_sqe+0x165/0x1060 io_uring/io_uring.c:1815
io_queue_sqe io_uring/io_uring.c:2042 [inline]
io_submit_sqe io_uring/io_uring.c:2320 [inline]
io_submit_sqes+0xbf4/0x2140 io_uring/io_uring.c:2434
__do_sys_io_uring_enter io_uring/io_uring.c:3280 [inline]
__se_sys_io_uring_enter+0x2e0/0x2b60 io_uring/io_uring.c:3219
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #0 (&ctx->uring_lock){+.+.}-{4:4}:
check_prev_add kernel/locking/lockdep.c:3165 [inline]
check_prevs_add kernel/locking/lockdep.c:3284 [inline]
validate_chain kernel/locking/lockdep.c:3908 [inline]
__lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237
lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868
__mutex_lock_common kernel/locking/mutex.c:614 [inline]
__mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776
io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179
io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195
io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646
io_uring_task_cancel include/linux/io_uring.h:24 [inline]
begin_new_exec+0x10ed/0x2440 fs/exec.c:1131
load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010
search_binary_handler fs/exec.c:1669 [inline]
exec_binprm fs/exec.c:1701 [inline]
bprm_execve+0x92e/0x1400 fs/exec.c:1753
do_execveat_common+0x510/0x6a0 fs/exec.c:1859
do_execve fs/exec.c:1933 [inline]
__do_sys_execve fs/exec.c:2009 [inline]
__se_sys_execve fs/exec.c:2004 [inline]
__x64_sys_execve+0x94/0xb0 fs/exec.c:2004
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f

other info that might help us debug this:

Chain exists of:
&ctx->uring_lock --> sb_writers#3 --> &sig->cred_guard_mutex

Possible unsafe locking scenario:

CPU0 CPU1
---- ----
lock(&sig->cred_guard_mutex);
lock(sb_writers#3);
lock(&sig->cred_guard_mutex);
lock(&ctx->uring_lock);

*** DEADLOCK ***

1 lock held by syz.0.9999/12287:
#0: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline]
#0: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733

stack backtrace:
CPU: 0 UID: 0 PID: 12287 Comm: syz.0.9999 Tainted: G L syzkaller #0 PREEMPT(full)
Tainted: [L]=SOFTLOCKUP
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/25/2025
Call Trace:
<TASK>
dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
print_circular_bug+0x2e2/0x300 kernel/locking/lockdep.c:2043
check_noncircular+0x12e/0x150 kernel/locking/lockdep.c:2175
check_prev_add kernel/locking/lockdep.c:3165 [inline]
check_prevs_add kernel/locking/lockdep.c:3284 [inline]
validate_chain kernel/locking/lockdep.c:3908 [inline]
__lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237
lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868
__mutex_lock_common kernel/locking/mutex.c:614 [inline]
__mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776
io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179
io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195
io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646
io_uring_task_cancel include/linux/io_uring.h:24 [inline]
begin_new_exec+0x10ed/0x2440 fs/exec.c:1131
load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010
search_binary_handler fs/exec.c:1669 [inline]
exec_binprm fs/exec.c:1701 [inline]
bprm_execve+0x92e/0x1400 fs/exec.c:1753
do_execveat_common+0x510/0x6a0 fs/exec.c:1859
do_execve fs/exec.c:1933 [inline]
__do_sys_execve fs/exec.c:2009 [inline]
__se_sys_execve fs/exec.c:2004 [inline]
__x64_sys_execve+0x94/0xb0 fs/exec.c:2004
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7ff3a8b8f749
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ff3a9a97038 EFLAGS: 00000246 ORIG_RAX: 000000000000003b
RAX: ffffffffffffffda RBX: 00007ff3a8de5fa0 RCX: 00007ff3a8b8f749
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000200000000400
RBP: 00007ff3a8c13f91 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ff3a8de6038 R14: 00007ff3a8de5fa0 R15: 00007ff3a8f0fa28
</TASK>

Add a separate lock just for the tctx_list, tctx_lock. This can nest
under ->uring_lock, where necessary, and be used separately for list
manipulation. For the cancelation off exec side, this removes the
need to grab ->uring_lock, hence fixing the circular locking
dependency.

Reported-by: syzbot+b0e3b77ffaa8a4067ce5@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Jens Axboe 5 months ago 5623eb1e fc5ff250

+23 -5

5 changed files

expand all

include

linux

io_uring_types.h

io_uring

cancel.c

io_uring.c

tctx.c

+7 -1

include/linux/io_uring_types.h

··· 424 424 struct user_struct *user; 425 425 struct mm_struct *mm_account; 426 426 427 + /* 428 + * List of tctx nodes for this ctx, protected by tctx_lock. For 429 + * cancelation purposes, nests under uring_lock. 430 + */ 431 + struct list_head tctx_list; 432 + struct mutex tctx_lock; 433 + 427 434 /* ctx exit and cancelation */ 428 435 struct llist_head fallback_llist; 429 436 struct delayed_work fallback_work; 430 437 struct work_struct exit_work; 431 - struct list_head tctx_list; 432 438 struct completion ref_comp; 433 439 434 440 /* io-wq management, e.g. thread count */

io_uring/cancel.c

··· 184 184 } while (1); 185 185 186 186 /* slow path, try all io-wq's */ 187 + __set_current_state(TASK_RUNNING); 187 188 io_ring_submit_lock(ctx, issue_flags); 189 + mutex_lock(&ctx->tctx_lock); 188 190 ret = -ENOENT; 189 191 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 190 192 ret = io_async_cancel_one(node->task->io_uring, cd); ··· 196 194 nr++; 197 195 } 198 196 } 197 + mutex_unlock(&ctx->tctx_lock); 199 198 io_ring_submit_unlock(ctx, issue_flags); 200 199 return all ? nr : ret; 201 200 } ··· 487 484 bool ret = false; 488 485 489 486 mutex_lock(&ctx->uring_lock); 487 + mutex_lock(&ctx->tctx_lock); 490 488 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 491 489 struct io_uring_task *tctx = node->task->io_uring; 492 490 ··· 500 496 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); 501 497 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 502 498 } 499 + mutex_unlock(&ctx->tctx_lock); 503 500 mutex_unlock(&ctx->uring_lock); 504 501 505 502 return ret;

io_uring/io_uring.c

··· 340 340 INIT_LIST_HEAD(&ctx->ltimeout_list); 341 341 init_llist_head(&ctx->work_llist); 342 342 INIT_LIST_HEAD(&ctx->tctx_list); 343 + mutex_init(&ctx->tctx_lock); 343 344 ctx->submit_state.free_list.next = NULL; 344 345 INIT_HLIST_HEAD(&ctx->waitid_list); 345 346 xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); ··· 3046 3045 exit.ctx = ctx; 3047 3046 3048 3047 mutex_lock(&ctx->uring_lock); 3048 + mutex_lock(&ctx->tctx_lock); 3049 3049 while (!list_empty(&ctx->tctx_list)) { 3050 3050 WARN_ON_ONCE(time_after(jiffies, timeout)); 3051 3051 ··· 3058 3056 if (WARN_ON_ONCE(ret)) 3059 3057 continue; 3060 3058 3059 + mutex_unlock(&ctx->tctx_lock); 3061 3060 mutex_unlock(&ctx->uring_lock); 3062 3061 /* 3063 3062 * See comment above for ··· 3067 3064 */ 3068 3065 wait_for_completion_interruptible(&exit.completion); 3069 3066 mutex_lock(&ctx->uring_lock); 3067 + mutex_lock(&ctx->tctx_lock); 3070 3068 } 3069 + mutex_unlock(&ctx->tctx_lock); 3071 3070 mutex_unlock(&ctx->uring_lock); 3072 3071 spin_lock(&ctx->completion_lock); 3073 3072 spin_unlock(&ctx->completion_lock);

io_uring/register.c

··· 320 320 return 0; 321 321 322 322 /* now propagate the restriction to all registered users */ 323 + mutex_lock(&ctx->tctx_lock); 323 324 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 324 325 tctx = node->task->io_uring; 325 326 if (WARN_ON_ONCE(!tctx->io_wq)) ··· 331 330 /* ignore errors, it always returns zero anyway */ 332 331 (void)io_wq_max_workers(tctx->io_wq, new_count); 333 332 } 333 + mutex_unlock(&ctx->tctx_lock); 334 334 return 0; 335 335 err: 336 336 if (sqd) {

+4 -4

io_uring/tctx.c

··· 136 136 return ret; 137 137 } 138 138 139 - mutex_lock(&ctx->uring_lock); 139 + mutex_lock(&ctx->tctx_lock); 140 140 list_add(&node->ctx_node, &ctx->tctx_list); 141 - mutex_unlock(&ctx->uring_lock); 141 + mutex_unlock(&ctx->tctx_lock); 142 142 } 143 143 return 0; 144 144 } ··· 176 176 WARN_ON_ONCE(current != node->task); 177 177 WARN_ON_ONCE(list_empty(&node->ctx_node)); 178 178 179 - mutex_lock(&node->ctx->uring_lock); 179 + mutex_lock(&node->ctx->tctx_lock); 180 180 list_del(&node->ctx_node); 181 - mutex_unlock(&node->ctx->uring_lock); 181 + mutex_unlock(&node->ctx->tctx_lock); 182 182 183 183 if (tctx->last == node->ctx) 184 184 tctx->last = NULL;

Configure Feed

Configure Feed