Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'io_uring-6.15' into for-6.16/io_uring

Merge in the 6.15 io_uring fixes, mostly so that the fdinfo changes can
be extended easily without causing merge conflicts.

* io_uring-6.15:
io_uring/fdinfo: grab ctx->uring_lock around io_uring_show_fdinfo()
io_uring/memmap: don't use page_address() on a highmem page
io_uring/uring_cmd: fix hybrid polling initialization issue
io_uring/sqpoll: Increase task_work submission batch size
io_uring: ensure deferred completions are flushed for multishot
io_uring: always arm linked timeouts prior to issue
io_uring/fdinfo: annotate racy sq/cq head/tail reads
io_uring: fix 'sync' handling of io_fallback_tw()
io_uring: don't duplicate flushing in io_req_post_cqe

+72 -71
+27 -25
io_uring/fdinfo.c
··· 86 86 } 87 87 #endif 88 88 89 - /* 90 - * Caller holds a reference to the file already, we don't need to do 91 - * anything else to get an extra reference. 92 - */ 93 - __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) 89 + static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) 94 90 { 95 - struct io_ring_ctx *ctx = file->private_data; 96 91 struct io_overflow_cqe *ocqe; 97 92 struct io_rings *r = ctx->rings; 98 93 struct rusage sq_usage; ··· 101 106 unsigned int sq_entries, cq_entries; 102 107 int sq_pid = -1, sq_cpu = -1; 103 108 u64 sq_total_time = 0, sq_work_time = 0; 104 - bool has_lock; 105 109 unsigned int i; 106 110 107 111 if (ctx->flags & IORING_SETUP_CQE32) ··· 117 123 seq_printf(m, "SqMask:\t0x%x\n", sq_mask); 118 124 seq_printf(m, "SqHead:\t%u\n", sq_head); 119 125 seq_printf(m, "SqTail:\t%u\n", sq_tail); 120 - seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); 126 + seq_printf(m, "CachedSqHead:\t%u\n", data_race(ctx->cached_sq_head)); 121 127 seq_printf(m, "CqMask:\t0x%x\n", cq_mask); 122 128 seq_printf(m, "CqHead:\t%u\n", cq_head); 123 129 seq_printf(m, "CqTail:\t%u\n", cq_tail); 124 - seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); 130 + seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail)); 125 131 seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head); 126 132 sq_entries = min(sq_tail - sq_head, ctx->sq_entries); 127 133 for (i = 0; i < sq_entries; i++) { ··· 170 176 seq_printf(m, "\n"); 171 177 } 172 178 173 - /* 174 - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, 175 - * since fdinfo case grabs it in the opposite direction of normal use 176 - * cases. If we fail to get the lock, we just don't iterate any 177 - * structures that could be going away outside the io_uring mutex. 
178 - */ 179 - has_lock = mutex_trylock(&ctx->uring_lock); 180 - 181 - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { 179 + if (ctx->flags & IORING_SETUP_SQPOLL) { 182 180 struct io_sq_data *sq = ctx->sq_data; 183 181 184 182 /* ··· 192 206 seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time); 193 207 seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time); 194 208 seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr); 195 - for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) { 209 + for (i = 0; i < ctx->file_table.data.nr; i++) { 196 210 struct file *f = NULL; 197 211 198 212 if (ctx->file_table.data.nodes[i]) ··· 204 218 } 205 219 } 206 220 seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); 207 - for (i = 0; has_lock && i < ctx->buf_table.nr; i++) { 221 + for (i = 0; i < ctx->buf_table.nr; i++) { 208 222 struct io_mapped_ubuf *buf = NULL; 209 223 210 224 if (ctx->buf_table.nodes[i]) ··· 214 228 else 215 229 seq_printf(m, "%5u: <none>\n", i); 216 230 } 217 - if (has_lock && !xa_empty(&ctx->personalities)) { 231 + if (!xa_empty(&ctx->personalities)) { 218 232 unsigned long index; 219 233 const struct cred *cred; 220 234 ··· 224 238 } 225 239 226 240 seq_puts(m, "PollList:\n"); 227 - for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) { 241 + for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { 228 242 struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; 229 243 struct io_kiocb *req; 230 244 ··· 232 246 seq_printf(m, " op=%d, task_works=%d\n", req->opcode, 233 247 task_work_pending(req->tctx->task)); 234 248 } 235 - 236 - if (has_lock) 237 - mutex_unlock(&ctx->uring_lock); 238 249 239 250 seq_puts(m, "CqOverflowList:\n"); 240 251 spin_lock(&ctx->completion_lock); ··· 244 261 } 245 262 spin_unlock(&ctx->completion_lock); 246 263 napi_show_fdinfo(ctx, m); 264 + } 265 + 266 + /* 267 + * Caller holds a reference to the file already, we don't need to do 268 + * anything else to get an extra reference. 
269 + */ 270 + __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) 271 + { 272 + struct io_ring_ctx *ctx = file->private_data; 273 + 274 + /* 275 + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, 276 + * since fdinfo case grabs it in the opposite direction of normal use 277 + * cases. 278 + */ 279 + if (mutex_trylock(&ctx->uring_lock)) { 280 + __io_uring_show_fdinfo(ctx, m); 281 + mutex_unlock(&ctx->uring_lock); 282 + } 247 283 } 248 284 #endif
+38 -44
io_uring/io_uring.c
··· 430 430 return req->link; 431 431 } 432 432 433 - static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) 434 - { 435 - if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) 436 - return NULL; 437 - return __io_prep_linked_timeout(req); 438 - } 439 - 440 - static noinline void __io_arm_ltimeout(struct io_kiocb *req) 441 - { 442 - io_queue_linked_timeout(__io_prep_linked_timeout(req)); 443 - } 444 - 445 - static inline void io_arm_ltimeout(struct io_kiocb *req) 446 - { 447 - if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) 448 - __io_arm_ltimeout(req); 449 - } 450 - 451 433 static void io_prep_async_work(struct io_kiocb *req) 452 434 { 453 435 const struct io_issue_def *def = &io_issue_defs[req->opcode]; ··· 482 500 483 501 static void io_queue_iowq(struct io_kiocb *req) 484 502 { 485 - struct io_kiocb *link = io_prep_linked_timeout(req); 486 503 struct io_uring_task *tctx = req->tctx; 487 504 488 505 BUG_ON(!tctx); ··· 506 525 507 526 trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); 508 527 io_wq_enqueue(tctx->io_wq, &req->work); 509 - if (link) 510 - io_queue_linked_timeout(link); 511 528 } 512 529 513 530 static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) ··· 843 864 struct io_ring_ctx *ctx = req->ctx; 844 865 bool posted; 845 866 867 + /* 868 + * If multishot has already posted deferred completions, ensure that 869 + * those are flushed first before posting this one. If not, CQEs 870 + * could get reordered. 
871 + */ 872 + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) 873 + __io_submit_flush_completions(ctx); 874 + 846 875 lockdep_assert(!io_wq_current_is_worker()); 847 876 lockdep_assert_held(&ctx->uring_lock); 848 877 849 - __io_cq_lock(ctx); 850 - posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); 878 + if (!ctx->lockless_cq) { 879 + spin_lock(&ctx->completion_lock); 880 + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); 881 + spin_unlock(&ctx->completion_lock); 882 + } else { 883 + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); 884 + } 885 + 851 886 ctx->submit_state.cq_flush = true; 852 - __io_cq_unlock_post(ctx); 853 887 return posted; 854 888 } 855 889 ··· 1050 1058 while (node) { 1051 1059 req = container_of(node, struct io_kiocb, io_task_work.node); 1052 1060 node = node->next; 1053 - if (sync && last_ctx != req->ctx) { 1061 + if (last_ctx != req->ctx) { 1054 1062 if (last_ctx) { 1055 - flush_delayed_work(&last_ctx->fallback_work); 1063 + if (sync) 1064 + flush_delayed_work(&last_ctx->fallback_work); 1056 1065 percpu_ref_put(&last_ctx->refs); 1057 1066 } 1058 1067 last_ctx = req->ctx; 1059 1068 percpu_ref_get(&last_ctx->refs); 1060 1069 } 1061 - if (llist_add(&req->io_task_work.node, 1062 - &req->ctx->fallback_llist)) 1063 - schedule_delayed_work(&req->ctx->fallback_work, 1); 1070 + if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) 1071 + schedule_delayed_work(&last_ctx->fallback_work, 1); 1064 1072 } 1065 1073 1066 1074 if (last_ctx) { 1067 - flush_delayed_work(&last_ctx->fallback_work); 1075 + if (sync) 1076 + flush_delayed_work(&last_ctx->fallback_work); 1068 1077 percpu_ref_put(&last_ctx->refs); 1069 1078 } 1070 1079 } ··· 1677 1684 return !!req->file; 1678 1685 } 1679 1686 1687 + #define REQ_ISSUE_SLOW_FLAGS (REQ_F_CREDS | REQ_F_ARM_LTIMEOUT) 1688 + 1680 1689 static inline int __io_issue_sqe(struct io_kiocb *req, 1681 1690 unsigned int issue_flags, 1682 1691 const struct io_issue_def *def) 
1683 1692 { 1684 1693 const struct cred *creds = NULL; 1694 + struct io_kiocb *link = NULL; 1685 1695 int ret; 1686 1696 1687 - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) 1688 - creds = override_creds(req->creds); 1697 + if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) { 1698 + if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) 1699 + creds = override_creds(req->creds); 1700 + if (req->flags & REQ_F_ARM_LTIMEOUT) 1701 + link = __io_prep_linked_timeout(req); 1702 + } 1689 1703 1690 1704 if (!def->audit_skip) 1691 1705 audit_uring_entry(req->opcode); ··· 1702 1702 if (!def->audit_skip) 1703 1703 audit_uring_exit(!ret, ret); 1704 1704 1705 - if (creds) 1706 - revert_creds(creds); 1705 + if (unlikely(creds || link)) { 1706 + if (creds) 1707 + revert_creds(creds); 1708 + if (link) 1709 + io_queue_linked_timeout(link); 1710 + } 1707 1711 1708 1712 return ret; 1709 1713 } ··· 1733 1729 1734 1730 if (ret == IOU_ISSUE_SKIP_COMPLETE) { 1735 1731 ret = 0; 1736 - io_arm_ltimeout(req); 1737 1732 1738 1733 /* If the op doesn't have a file, we're not polling for it */ 1739 1734 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) ··· 1786 1783 __io_req_set_refcount(req, 2); 1787 1784 else 1788 1785 req_ref_get(req); 1789 - 1790 - io_arm_ltimeout(req); 1791 1786 1792 1787 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ 1793 1788 if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { ··· 1903 1902 static void io_queue_async(struct io_kiocb *req, int ret) 1904 1903 __must_hold(&req->ctx->uring_lock) 1905 1904 { 1906 - struct io_kiocb *linked_timeout; 1907 - 1908 1905 if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { 1909 1906 io_req_defer_failed(req, ret); 1910 1907 return; 1911 1908 } 1912 - 1913 - linked_timeout = io_prep_linked_timeout(req); 1914 1909 1915 1910 switch (io_arm_poll_handler(req, 0)) { 1916 1911 case IO_APOLL_READY: ··· 1920 1923 case IO_APOLL_OK: 1921 1924 break; 1922 1925 } 1923 - 1924 - 
if (linked_timeout) 1925 - io_queue_linked_timeout(linked_timeout); 1926 1926 } 1927 1927 1928 1928 static inline void io_queue_sqe(struct io_kiocb *req)
+1 -1
io_uring/memmap.c
··· 117 117 void *ptr; 118 118 119 119 if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { 120 - if (ifd.nr_folios == 1) { 120 + if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) { 121 121 mr->ptr = page_address(mr->pages[0]); 122 122 return 0; 123 123 }
+1 -1
io_uring/sqpoll.c
··· 20 20 #include "sqpoll.h" 21 21 22 22 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 23 - #define IORING_TW_CAP_ENTRIES_VALUE 8 23 + #define IORING_TW_CAP_ENTRIES_VALUE 32 24 24 25 25 enum { 26 26 IO_SQ_THREAD_SHOULD_STOP = 0,
+5
io_uring/uring_cmd.c
··· 251 251 return -EOPNOTSUPP; 252 252 issue_flags |= IO_URING_F_IOPOLL; 253 253 req->iopoll_completed = 0; 254 + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { 255 + /* make sure every req only blocks once */ 256 + req->flags &= ~REQ_F_IOPOLL_STATE; 257 + req->iopoll_start = ktime_get_ns(); 258 + } 254 259 } 255 260 256 261 ret = file->f_op->uring_cmd(ioucmd, issue_flags);