Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: remove unconditional looping in local task_work handling

If we have a ton of notifications coming in, we can be looping in the
local task_work run path for a long time. This can be problematic for
various reasons, mostly because we can starve userspace. If the
application is waiting on N events, then only re-run the local work
list if we still need more events to reach N.

Fixes: c0e0d6ba25f1 ("io_uring: add IORING_SETUP_DEFER_TASKRUN")
Signed-off-by: Jens Axboe <axboe@kernel.dk>

+29 -15
+29 -15
io_uring/io_uring.c
··· 1386 1386 } 1387 1387 } 1388 1388 1389 - static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) 1389 + static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, 1390 + int min_events) 1391 + { 1392 + if (llist_empty(&ctx->work_llist)) 1393 + return false; 1394 + if (events < min_events) 1395 + return true; 1396 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1397 + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1398 + return false; 1399 + } 1400 + 1401 + static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, 1402 + int min_events) 1390 1403 { 1391 1404 struct llist_node *node; 1392 1405 unsigned int loops = 0; ··· 1427 1414 } 1428 1415 loops++; 1429 1416 1430 - if (!llist_empty(&ctx->work_llist)) 1417 + if (io_run_local_work_continue(ctx, ret, min_events)) 1431 1418 goto again; 1432 1419 if (ts->locked) { 1433 1420 io_submit_flush_completions(ctx); 1434 - if (!llist_empty(&ctx->work_llist)) 1421 + if (io_run_local_work_continue(ctx, ret, min_events)) 1435 1422 goto again; 1436 1423 } 1424 + 1437 1425 trace_io_uring_local_work_run(ctx, ret, loops); 1438 1426 return ret; 1439 1427 } 1440 1428 1441 - static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) 1429 + static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, 1430 + int min_events) 1442 1431 { 1443 1432 struct io_tw_state ts = { .locked = true, }; 1444 1433 int ret; ··· 1448 1433 if (llist_empty(&ctx->work_llist)) 1449 1434 return 0; 1450 1435 1451 - ret = __io_run_local_work(ctx, &ts); 1436 + ret = __io_run_local_work(ctx, &ts, min_events); 1452 1437 /* shouldn't happen! 
*/ 1453 1438 if (WARN_ON_ONCE(!ts.locked)) 1454 1439 mutex_lock(&ctx->uring_lock); 1455 1440 return ret; 1456 1441 } 1457 1442 1458 - static int io_run_local_work(struct io_ring_ctx *ctx) 1443 + static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) 1459 1444 { 1460 1445 struct io_tw_state ts = {}; 1461 1446 int ret; 1462 1447 1463 1448 ts.locked = mutex_trylock(&ctx->uring_lock); 1464 - ret = __io_run_local_work(ctx, &ts); 1449 + ret = __io_run_local_work(ctx, &ts, min_events); 1465 1450 if (ts.locked) 1466 1451 mutex_unlock(&ctx->uring_lock); 1467 1452 ··· 1657 1642 io_task_work_pending(ctx)) { 1658 1643 u32 tail = ctx->cached_cq_tail; 1659 1644 1660 - (void) io_run_local_work_locked(ctx); 1645 + (void) io_run_local_work_locked(ctx, min); 1661 1646 1662 1647 if (task_work_pending(current) || 1663 1648 wq_list_empty(&ctx->iopoll_list)) { ··· 2501 2486 { 2502 2487 if (!llist_empty(&ctx->work_llist)) { 2503 2488 __set_current_state(TASK_RUNNING); 2504 - if (io_run_local_work(ctx) > 0) 2489 + if (io_run_local_work(ctx, INT_MAX) > 0) 2505 2490 return 0; 2506 2491 } 2507 2492 if (io_run_task_work() > 0) ··· 2569 2554 if (!io_allowed_run_tw(ctx)) 2570 2555 return -EEXIST; 2571 2556 if (!llist_empty(&ctx->work_llist)) 2572 - io_run_local_work(ctx); 2557 + io_run_local_work(ctx, min_events); 2573 2558 io_run_task_work(); 2574 2559 io_cqring_overflow_flush(ctx); 2575 2560 /* if user messes with these they will just get an early return */ ··· 2607 2592 2608 2593 trace_io_uring_cqring_wait(ctx, min_events); 2609 2594 do { 2595 + int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); 2610 2596 unsigned long check_cq; 2611 2597 2612 2598 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 2613 - int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); 2614 - 2615 2599 atomic_set(&ctx->cq_wait_nr, nr_wait); 2616 2600 set_current_state(TASK_INTERRUPTIBLE); 2617 2601 } else { ··· 2629 2615 */ 2630 2616 io_run_task_work(); 2631 2617 if 
(!llist_empty(&ctx->work_llist)) 2632 - io_run_local_work(ctx); 2618 + io_run_local_work(ctx, nr_wait); 2633 2619 2634 2620 /* 2635 2621 * Non-local task_work will be run on exit to userspace, but ··· 3284 3270 3285 3271 if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 3286 3272 io_allowed_defer_tw_run(ctx)) 3287 - ret |= io_run_local_work(ctx) > 0; 3273 + ret |= io_run_local_work(ctx, INT_MAX) > 0; 3288 3274 ret |= io_cancel_defer_files(ctx, task, cancel_all); 3289 3275 mutex_lock(&ctx->uring_lock); 3290 3276 ret |= io_poll_remove_all(ctx, task, cancel_all); ··· 3646 3632 * it should handle ownership problems if any. 3647 3633 */ 3648 3634 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) 3649 - (void)io_run_local_work_locked(ctx); 3635 + (void)io_run_local_work_locked(ctx, min_complete); 3650 3636 } 3651 3637 mutex_unlock(&ctx->uring_lock); 3652 3638 }