Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'io_uring-6.13-20242901' of git://git.kernel.dk/linux

Pull more io_uring updates from Jens Axboe:

- Remove a struct left over from when cqwait registered waiting was
transitioned to regions.

- Fix for an issue introduced in this merge window, where nop->fd might
be used uninitialized. Ensure it's always set.

- Add a cap on how much task_work is run in local task_work mode, to
prevent long, bursty chains from adding too much latency.

- Work around xa_store() leaving ->head non-NULL if it encounters an
allocation error during storing. Just a debug trigger, and can go
away once xa_store() behaves in a more expected way for this
condition. Not a major thing as it basically requires fault injection
to trigger it.

- Fix a few mapping corner cases.

- Fix a KCSAN complaint about reading the table size after unlocking.
Again not a "real" issue, but it's easy to silence by just keeping the
read inside the lock that protects it.

* tag 'io_uring-6.13-20242901' of git://git.kernel.dk/linux:
io_uring/tctx: work around xa_store() allocation error issue
io_uring: fix corner case forgetting to vunmap
io_uring: fix task_work cap overshooting
io_uring: check for overflows in io_pin_pages
io_uring/nop: ensure nop->fd is always initialized
io_uring: limit local tw done
io_uring: add io_local_work_pending()
io_uring/region: return negative -E2BIG in io_create_region()
io_uring: protect register tracing
io_uring: remove io_uring_cqwait_reg_arg

+87 -47
+1
include/linux/io_uring_types.h
··· 336 336 */ 337 337 struct { 338 338 struct llist_head work_llist; 339 + struct llist_head retry_llist; 339 340 unsigned long check_cq; 340 341 atomic_t cq_wait_nr; 341 342 atomic_t cq_timeouts;
-14
include/uapi/linux/io_uring.h
··· 874 874 }; 875 875 876 876 /* 877 - * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of 878 - * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is 879 - * called rather than pass in a wait argument structure separately. 880 - */ 881 - struct io_uring_cqwait_reg_arg { 882 - __u32 flags; 883 - __u32 struct_size; 884 - __u32 nr_entries; 885 - __u32 pad; 886 - __u64 user_addr; 887 - __u64 pad2[3]; 888 - }; 889 - 890 - /* 891 877 * Argument for io_uring_enter(2) with 892 878 * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument 893 879 * is an index into a previously registered fixed wait region described by
+50 -25
io_uring/io_uring.c
··· 121 121 122 122 #define IO_COMPL_BATCH 32 123 123 #define IO_REQ_ALLOC_BATCH 8 124 + #define IO_LOCAL_TW_DEFAULT_MAX 20 124 125 125 126 struct io_defer_entry { 126 127 struct list_head list; ··· 1256 1255 struct llist_node *node = llist_del_all(&ctx->work_llist); 1257 1256 1258 1257 __io_fallback_tw(node, false); 1258 + node = llist_del_all(&ctx->retry_llist); 1259 + __io_fallback_tw(node, false); 1259 1260 } 1260 1261 1261 1262 static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, 1262 1263 int min_events) 1263 1264 { 1264 - if (llist_empty(&ctx->work_llist)) 1265 + if (!io_local_work_pending(ctx)) 1265 1266 return false; 1266 1267 if (events < min_events) 1267 1268 return true; ··· 1272 1269 return false; 1273 1270 } 1274 1271 1272 + static int __io_run_local_work_loop(struct llist_node **node, 1273 + struct io_tw_state *ts, 1274 + int events) 1275 + { 1276 + int ret = 0; 1277 + 1278 + while (*node) { 1279 + struct llist_node *next = (*node)->next; 1280 + struct io_kiocb *req = container_of(*node, struct io_kiocb, 1281 + io_task_work.node); 1282 + INDIRECT_CALL_2(req->io_task_work.func, 1283 + io_poll_task_func, io_req_rw_complete, 1284 + req, ts); 1285 + *node = next; 1286 + if (++ret >= events) 1287 + break; 1288 + } 1289 + 1290 + return ret; 1291 + } 1292 + 1275 1293 static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, 1276 - int min_events) 1294 + int min_events, int max_events) 1277 1295 { 1278 1296 struct llist_node *node; 1279 1297 unsigned int loops = 0; ··· 1305 1281 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1306 1282 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1307 1283 again: 1284 + min_events -= ret; 1285 + ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events); 1286 + if (ctx->retry_llist.first) 1287 + goto retry_done; 1288 + 1308 1289 /* 1309 1290 * llists are in reverse order, flip it back the right way before 1310 1291 * running the pending items. 
1311 1292 */ 1312 1293 node = llist_reverse_order(llist_del_all(&ctx->work_llist)); 1313 - while (node) { 1314 - struct llist_node *next = node->next; 1315 - struct io_kiocb *req = container_of(node, struct io_kiocb, 1316 - io_task_work.node); 1317 - INDIRECT_CALL_2(req->io_task_work.func, 1318 - io_poll_task_func, io_req_rw_complete, 1319 - req, ts); 1320 - ret++; 1321 - node = next; 1322 - } 1294 + ret += __io_run_local_work_loop(&node, ts, max_events - ret); 1295 + ctx->retry_llist.first = node; 1323 1296 loops++; 1324 1297 1325 1298 if (io_run_local_work_continue(ctx, ret, min_events)) 1326 1299 goto again; 1300 + retry_done: 1327 1301 io_submit_flush_completions(ctx); 1328 1302 if (io_run_local_work_continue(ctx, ret, min_events)) 1329 1303 goto again; ··· 1335 1313 { 1336 1314 struct io_tw_state ts = {}; 1337 1315 1338 - if (llist_empty(&ctx->work_llist)) 1316 + if (!io_local_work_pending(ctx)) 1339 1317 return 0; 1340 - return __io_run_local_work(ctx, &ts, min_events); 1318 + return __io_run_local_work(ctx, &ts, min_events, 1319 + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 1341 1320 } 1342 1321 1343 - static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) 1322 + static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, 1323 + int max_events) 1344 1324 { 1345 1325 struct io_tw_state ts = {}; 1346 1326 int ret; 1347 1327 1348 1328 mutex_lock(&ctx->uring_lock); 1349 - ret = __io_run_local_work(ctx, &ts, min_events); 1329 + ret = __io_run_local_work(ctx, &ts, min_events, max_events); 1350 1330 mutex_unlock(&ctx->uring_lock); 1351 1331 return ret; 1352 1332 } ··· 2352 2328 2353 2329 int io_run_task_work_sig(struct io_ring_ctx *ctx) 2354 2330 { 2355 - if (!llist_empty(&ctx->work_llist)) { 2331 + if (io_local_work_pending(ctx)) { 2356 2332 __set_current_state(TASK_RUNNING); 2357 - if (io_run_local_work(ctx, INT_MAX) > 0) 2333 + if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) 2358 2334 return 0; 2359 2335 } 2360 2336 if 
(io_run_task_work() > 0) ··· 2483 2459 { 2484 2460 if (unlikely(READ_ONCE(ctx->check_cq))) 2485 2461 return 1; 2486 - if (unlikely(!llist_empty(&ctx->work_llist))) 2462 + if (unlikely(io_local_work_pending(ctx))) 2487 2463 return 1; 2488 2464 if (unlikely(task_work_pending(current))) 2489 2465 return 1; ··· 2517 2493 2518 2494 if (!io_allowed_run_tw(ctx)) 2519 2495 return -EEXIST; 2520 - if (!llist_empty(&ctx->work_llist)) 2521 - io_run_local_work(ctx, min_events); 2496 + if (io_local_work_pending(ctx)) 2497 + io_run_local_work(ctx, min_events, 2498 + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 2522 2499 io_run_task_work(); 2523 2500 2524 2501 if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) ··· 2589 2564 * If we got woken because of task_work being processed, run it 2590 2565 * now rather than let the caller do another wait loop. 2591 2566 */ 2592 - if (!llist_empty(&ctx->work_llist)) 2593 - io_run_local_work(ctx, nr_wait); 2567 + if (io_local_work_pending(ctx)) 2568 + io_run_local_work(ctx, nr_wait, nr_wait); 2594 2569 io_run_task_work(); 2595 2570 2596 2571 /* ··· 3102 3077 3103 3078 if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 3104 3079 io_allowed_defer_tw_run(ctx)) 3105 - ret |= io_run_local_work(ctx, INT_MAX) > 0; 3080 + ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; 3106 3081 ret |= io_cancel_defer_files(ctx, tctx, cancel_all); 3107 3082 mutex_lock(&ctx->uring_lock); 3108 3083 ret |= io_poll_remove_all(ctx, tctx, cancel_all); ··· 3183 3158 io_run_task_work(); 3184 3159 io_uring_drop_tctx_refs(current); 3185 3160 xa_for_each(&tctx->xa, index, node) { 3186 - if (!llist_empty(&node->ctx->work_llist)) { 3161 + if (io_local_work_pending(node->ctx)) { 3187 3162 WARN_ON_ONCE(node->ctx->submitter_task && 3188 3163 node->ctx->submitter_task != current); 3189 3164 goto end_wait;
+7 -2
io_uring/io_uring.h
··· 347 347 return ret; 348 348 } 349 349 350 + static inline bool io_local_work_pending(struct io_ring_ctx *ctx) 351 + { 352 + return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); 353 + } 354 + 350 355 static inline bool io_task_work_pending(struct io_ring_ctx *ctx) 351 356 { 352 - return task_work_pending(current) || !llist_empty(&ctx->work_llist); 357 + return task_work_pending(current) || io_local_work_pending(ctx); 353 358 } 354 359 355 360 static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) ··· 489 484 static inline bool io_has_work(struct io_ring_ctx *ctx) 490 485 { 491 486 return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || 492 - !llist_empty(&ctx->work_llist); 487 + io_local_work_pending(ctx); 493 488 } 494 489 #endif
+10 -3
io_uring/memmap.c
··· 73 73 ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); 74 74 if (!IS_ERR(ret)) 75 75 goto done; 76 + if (nr_pages == 1) 77 + goto fail; 76 78 77 79 ret = io_mem_alloc_single(pages, nr_pages, size, gfp); 78 80 if (!IS_ERR(ret)) { ··· 83 81 *npages = nr_pages; 84 82 return ret; 85 83 } 86 - 84 + fail: 87 85 kvfree(pages); 88 86 *out_pages = NULL; 89 87 *npages = 0; ··· 138 136 struct page **pages; 139 137 int ret; 140 138 141 - end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 139 + if (check_add_overflow(uaddr, len, &end)) 140 + return ERR_PTR(-EOVERFLOW); 141 + if (check_add_overflow(end, PAGE_SIZE - 1, &end)) 142 + return ERR_PTR(-EOVERFLOW); 143 + 144 + end = end >> PAGE_SHIFT; 142 145 start = uaddr >> PAGE_SHIFT; 143 146 nr_pages = end - start; 144 147 if (WARN_ON_ONCE(!nr_pages)) ··· 236 229 if (!reg->size || reg->mmap_offset || reg->id) 237 230 return -EINVAL; 238 231 if ((reg->size >> PAGE_SHIFT) > INT_MAX) 239 - return E2BIG; 232 + return -E2BIG; 240 233 if ((reg->user_addr | reg->size) & ~PAGE_MASK) 241 234 return -EINVAL; 242 235 if (check_add_overflow(reg->user_addr, reg->size, &end))
+5 -1
io_uring/nop.c
··· 35 35 nop->result = READ_ONCE(sqe->len); 36 36 else 37 37 nop->result = 0; 38 - if (nop->flags & IORING_NOP_FIXED_FILE) 38 + if (nop->flags & IORING_NOP_FILE) 39 39 nop->fd = READ_ONCE(sqe->fd); 40 + else 41 + nop->fd = -1; 40 42 if (nop->flags & IORING_NOP_FIXED_BUFFER) 41 43 nop->buffer = READ_ONCE(sqe->buf_index); 44 + else 45 + nop->buffer = -1; 42 46 return 0; 43 47 } 44 48
+2 -1
io_uring/register.c
··· 905 905 906 906 mutex_lock(&ctx->uring_lock); 907 907 ret = __io_uring_register(ctx, opcode, arg, nr_args); 908 - mutex_unlock(&ctx->uring_lock); 908 + 909 909 trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr, 910 910 ctx->buf_table.nr, ret); 911 + mutex_unlock(&ctx->uring_lock); 911 912 if (!use_registered_ring) 912 913 fput(file); 913 914 return ret;
+12 -1
io_uring/tctx.c
··· 47 47 void __io_uring_free(struct task_struct *tsk) 48 48 { 49 49 struct io_uring_task *tctx = tsk->io_uring; 50 + struct io_tctx_node *node; 51 + unsigned long index; 50 52 51 - WARN_ON_ONCE(!xa_empty(&tctx->xa)); 53 + /* 54 + * Fault injection forcing allocation errors in the xa_store() path 55 + * can lead to xa_empty() returning false, even though no actual 56 + * node is stored in the xarray. Until that gets sorted out, attempt 57 + * an iteration here and warn if any entries are found. 58 + */ 59 + xa_for_each(&tctx->xa, index, node) { 60 + WARN_ON_ONCE(1); 61 + break; 62 + } 52 63 WARN_ON_ONCE(tctx->io_wq); 53 64 WARN_ON_ONCE(tctx->cached_refs); 54 65