Merge tag 'for-6.8/io_uring-2024-01-18' of git://git.kernel.dk/linux

+45 -18

io_uring/io_uring.c

··· 137 137 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 138 138 #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) 139 139 140 + /* 141 + * No waiters. It's larger than any valid value of the tw counter 142 + * so that tests against ->cq_wait_nr would fail and skip wake_up(). 143 + */ 144 + #define IO_CQ_WAKE_INIT (-1U) 145 + /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ 146 + #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) 147 + 140 148 static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 141 149 struct task_struct *task, 142 150 bool cancel_all); ··· 311 303 goto err; 312 304 313 305 ctx->flags = p->flags; 306 + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); 314 307 init_waitqueue_head(&ctx->sqo_sq_wait); 315 308 INIT_LIST_HEAD(&ctx->sqd_list); 316 309 INIT_LIST_HEAD(&ctx->cq_overflow_list); ··· 1313 1304 { 1314 1305 struct io_ring_ctx *ctx = req->ctx; 1315 1306 unsigned nr_wait, nr_tw, nr_tw_prev; 1316 - struct llist_node *first; 1307 + struct llist_node *head; 1317 1308 1309 + /* See comment above IO_CQ_WAKE_INIT */ 1310 + BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); 1311 + 1312 + /* 1313 + * We don't know how many requests there are in the link and whether 1314 + * they can even be queued lazily, fall back to non-lazy. 
1315 + */ 1318 1316 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) 1319 1317 flags &= ~IOU_F_TWQ_LAZY_WAKE; 1320 1318 1321 - first = READ_ONCE(ctx->work_llist.first); 1319 + head = READ_ONCE(ctx->work_llist.first); 1322 1320 do { 1323 1321 nr_tw_prev = 0; 1324 - if (first) { 1325 - struct io_kiocb *first_req = container_of(first, 1322 + if (head) { 1323 + struct io_kiocb *first_req = container_of(head, 1326 1324 struct io_kiocb, 1327 1325 io_task_work.node); 1328 1326 /* ··· 1338 1322 */ 1339 1323 nr_tw_prev = READ_ONCE(first_req->nr_tw); 1340 1324 } 1325 + 1326 + /* 1327 + * Theoretically, it can overflow, but that's fine as one of 1328 + * previous adds should've tried to wake the task. 1329 + */ 1341 1330 nr_tw = nr_tw_prev + 1; 1342 - /* Large enough to fail the nr_wait comparison below */ 1343 1331 if (!(flags & IOU_F_TWQ_LAZY_WAKE)) 1344 - nr_tw = -1U; 1332 + nr_tw = IO_CQ_WAKE_FORCE; 1345 1333 1346 1334 req->nr_tw = nr_tw; 1347 - req->io_task_work.node.next = first; 1348 - } while (!try_cmpxchg(&ctx->work_llist.first, &first, 1335 + req->io_task_work.node.next = head; 1336 + } while (!try_cmpxchg(&ctx->work_llist.first, &head, 1349 1337 &req->io_task_work.node)); 1350 1338 1351 - if (!first) { 1339 + /* 1340 + * cmpxchg implies a full barrier, which pairs with the barrier 1341 + * in set_current_state() on the io_cqring_wait() side. It's used 1342 + * to ensure that either we see updated ->cq_wait_nr, or waiters 1343 + * going to sleep will observe the work added to the list, which 1344 + * is similar to the wait/wake task state sync. 
1345 + */ 1346 + 1347 + if (!head) { 1352 1348 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1353 1349 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1354 1350 if (ctx->has_evfd) ··· 1368 1340 } 1369 1341 1370 1342 nr_wait = atomic_read(&ctx->cq_wait_nr); 1371 - /* no one is waiting */ 1372 - if (!nr_wait) 1343 + /* not enough or no one is waiting */ 1344 + if (nr_tw < nr_wait) 1373 1345 return; 1374 - /* either not enough or the previous add has already woken it up */ 1375 - if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) 1346 + /* the previous add has already woken it up */ 1347 + if (nr_tw_prev >= nr_wait) 1376 1348 return; 1377 - /* pairs with set_current_state() in io_cqring_wait() */ 1378 - smp_mb__after_atomic(); 1379 1349 wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); 1380 1350 } 1381 1351 ··· 2026 2000 goto out; 2027 2001 fd = array_index_nospec(fd, ctx->nr_user_files); 2028 2002 slot = io_fixed_file_slot(&ctx->file_table, fd); 2029 - file = io_slot_file(slot); 2003 + if (!req->rsrc_node) 2004 + __io_req_set_rsrc_node(req, ctx); 2030 2005 req->flags |= io_slot_flags(slot); 2031 - io_req_set_rsrc_node(req, ctx, 0); 2006 + file = io_slot_file(slot); 2032 2007 out: 2033 2008 io_ring_submit_unlock(ctx, issue_flags); 2034 2009 return file; ··· 2640 2613 2641 2614 ret = io_cqring_wait_schedule(ctx, &iowq); 2642 2615 __set_current_state(TASK_RUNNING); 2643 - atomic_set(&ctx->cq_wait_nr, 0); 2616 + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); 2644 2617 2645 2618 /* 2646 2619 * Run task_work after scheduling and before io_should_wake().

+5 -3

io_uring/register.c

··· 14 14 #include <linux/slab.h> 15 15 #include <linux/uaccess.h> 16 16 #include <linux/nospec.h> 17 + #include <linux/compat.h> 17 18 #include <linux/io_uring.h> 18 19 #include <linux/io_uring_types.h> 19 20 ··· 279 278 if (len > cpumask_size()) 280 279 len = cpumask_size(); 281 280 282 - if (in_compat_syscall()) { 281 + #ifdef CONFIG_COMPAT 282 + if (in_compat_syscall()) 283 283 ret = compat_get_bitmap(cpumask_bits(new_mask), 284 284 (const compat_ulong_t __user *)arg, 285 285 len * 8 /* CHAR_BIT */); 286 - } else { 286 + else 287 + #endif 287 288 ret = copy_from_user(new_mask, arg, len); 288 - } 289 289 290 290 if (ret) { 291 291 free_cpumask_var(new_mask);

+9 -5

io_uring/rsrc.h

··· 102 102 node->refs++; 103 103 } 104 104 105 + static inline void __io_req_set_rsrc_node(struct io_kiocb *req, 106 + struct io_ring_ctx *ctx) 107 + { 108 + lockdep_assert_held(&ctx->uring_lock); 109 + req->rsrc_node = ctx->rsrc_node; 110 + io_charge_rsrc_node(ctx, ctx->rsrc_node); 111 + } 112 + 105 113 static inline void io_req_set_rsrc_node(struct io_kiocb *req, 106 114 struct io_ring_ctx *ctx, 107 115 unsigned int issue_flags) 108 116 { 109 117 if (!req->rsrc_node) { 110 118 io_ring_submit_lock(ctx, issue_flags); 111 - 112 - lockdep_assert_held(&ctx->uring_lock); 113 - 114 - req->rsrc_node = ctx->rsrc_node; 115 - io_charge_rsrc_node(ctx, ctx->rsrc_node); 119 + __io_req_set_rsrc_node(req, ctx); 116 120 io_ring_submit_unlock(ctx, issue_flags); 117 121 } 118 122 }

+27 -21

io_uring/rw.c

··· 168 168 kfree(io->free_iovec); 169 169 } 170 170 171 - static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) 172 - { 173 - switch (ret) { 174 - case -EIOCBQUEUED: 175 - break; 176 - case -ERESTARTSYS: 177 - case -ERESTARTNOINTR: 178 - case -ERESTARTNOHAND: 179 - case -ERESTART_RESTARTBLOCK: 180 - /* 181 - * We can't just restart the syscall, since previously 182 - * submitted sqes may already be in progress. Just fail this 183 - * IO with EINTR. 184 - */ 185 - ret = -EINTR; 186 - fallthrough; 187 - default: 188 - kiocb->ki_complete(kiocb, ret); 189 - } 190 - } 191 - 192 171 static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) 193 172 { 194 173 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); ··· 348 369 349 370 /* order with io_iopoll_complete() checking ->iopoll_completed */ 350 371 smp_store_release(&req->iopoll_completed, 1); 372 + } 373 + 374 + static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) 375 + { 376 + /* IO was queued async, completion will happen later */ 377 + if (ret == -EIOCBQUEUED) 378 + return; 379 + 380 + /* transform internal restart error codes */ 381 + if (unlikely(ret < 0)) { 382 + switch (ret) { 383 + case -ERESTARTSYS: 384 + case -ERESTARTNOINTR: 385 + case -ERESTARTNOHAND: 386 + case -ERESTART_RESTARTBLOCK: 387 + /* 388 + * We can't just restart the syscall, since previously 389 + * submitted sqes may already be in progress. Just fail 390 + * this IO with EINTR. 391 + */ 392 + ret = -EINTR; 393 + break; 394 + } 395 + } 396 + 397 + INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll, 398 + io_complete_rw, kiocb, ret); 351 399 } 352 400 353 401 static int kiocb_done(struct io_kiocb *req, ssize_t ret,

Configure Feed

Configure Feed