Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: add support for hybrid IOPOLL

A new hybrid poll mode is implemented in the io_uring layer. Once an IO
is issued, the task does not poll immediately; it first blocks (sleeps),
is woken up again before the IO completes, and then polls to reap the IO.
While this poll method could be a suboptimal solution when running on a
single thread, it offers performance lower than regular polling but
higher than IRQ-driven completion, and its CPU utilization is also lower
than that of regular polling.

To use hybrid polling, the ring must be setup with both the
IORING_SETUP_IOPOLL and IORING_SETUP_HYBRID_IOPOLL flags set. Hybrid
polling has the same restrictions as IOPOLL, in that commands must
explicitly support it.

Signed-off-by: hexue <xue01.he@samsung.com>
Link: https://lore.kernel.org/r/20241101091957.564220-2-xue01.he@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

hexue and committed by
Jens Axboe
01ee194d c1329532

+108 -14
+17 -2
include/linux/io_uring_types.h
··· 298 298 * ->uring_cmd() by io_uring_cmd_insert_cancelable() 299 299 */ 300 300 struct hlist_head cancelable_uring_cmd; 301 + /* 302 + * For Hybrid IOPOLL, runtime in hybrid polling, without 303 + * scheduling time 304 + */ 305 + u64 hybrid_poll_time; 301 306 } ____cacheline_aligned_in_smp; 302 307 303 308 struct { ··· 454 449 REQ_F_LINK_TIMEOUT_BIT, 455 450 REQ_F_NEED_CLEANUP_BIT, 456 451 REQ_F_POLLED_BIT, 452 + REQ_F_HYBRID_IOPOLL_STATE_BIT, 457 453 REQ_F_BUFFER_SELECTED_BIT, 458 454 REQ_F_BUFFER_RING_BIT, 459 455 REQ_F_REISSUE_BIT, ··· 513 507 REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT), 514 508 /* already went through poll handler */ 515 509 REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT), 510 + /* every req only blocks once in hybrid poll */ 511 + REQ_F_IOPOLL_STATE = IO_REQ_FLAG(REQ_F_HYBRID_IOPOLL_STATE_BIT), 516 512 /* buffer already selected */ 517 513 REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT), 518 514 /* buffer selected from ring, needs commit */ ··· 647 639 atomic_t refs; 648 640 bool cancel_seq_set; 649 641 struct io_task_work io_task_work; 650 - /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ 651 - struct hlist_node hash_node; 642 + union { 643 + /* 644 + * for polled requests, i.e. IORING_OP_POLL_ADD and async armed 645 + * poll 646 + */ 647 + struct hlist_node hash_node; 648 + /* For IOPOLL setup queues, with hybrid polling */ 649 + u64 iopoll_start; 650 + }; 652 651 /* internal polling, see IORING_FEAT_FAST_POLL */ 653 652 struct async_poll *apoll; 654 653 /* opcode allocated if it needs to store data for async defer */
+3
include/uapi/linux/io_uring.h
··· 200 200 */ 201 201 #define IORING_SETUP_NO_SQARRAY (1U << 16) 202 202 203 + /* Use hybrid poll in iopoll process */ 204 + #define IORING_SETUP_HYBRID_IOPOLL (1U << 17) 205 + 203 206 enum io_uring_op { 204 207 IORING_OP_NOP, 205 208 IORING_OP_READV,
+7 -1
io_uring/io_uring.c
··· 307 307 goto err; 308 308 309 309 ctx->flags = p->flags; 310 + ctx->hybrid_poll_time = LLONG_MAX; 310 311 atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); 311 312 init_waitqueue_head(&ctx->sqo_sq_wait); 312 313 INIT_LIST_HEAD(&ctx->sqd_list); ··· 3631 3630 if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 3632 3631 static_branch_inc(&io_key_has_sqarray); 3633 3632 3633 + /* HYBRID_IOPOLL only valid with IOPOLL */ 3634 + if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) == 3635 + IORING_SETUP_HYBRID_IOPOLL) 3636 + return -EINVAL; 3637 + 3634 3638 if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 3635 3639 !(ctx->flags & IORING_SETUP_IOPOLL) && 3636 3640 !(ctx->flags & IORING_SETUP_SQPOLL)) ··· 3791 3785 IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | 3792 3786 IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | 3793 3787 IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | 3794 - IORING_SETUP_NO_SQARRAY)) 3788 + IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) 3795 3789 return -EINVAL; 3796 3790 3797 3791 return io_uring_create(entries, &p, params);
+81 -11
io_uring/rw.c
··· 817 817 kiocb->ki_flags |= IOCB_HIPRI; 818 818 kiocb->ki_complete = io_complete_rw_iopoll; 819 819 req->iopoll_completed = 0; 820 + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { 821 + /* make sure every req only blocks once*/ 822 + req->flags &= ~REQ_F_IOPOLL_STATE; 823 + req->iopoll_start = ktime_get_ns(); 824 + } 820 825 } else { 821 826 if (kiocb->ki_flags & IOCB_HIPRI) 822 827 return -EINVAL; ··· 1120 1115 io_req_set_res(req, res, req->cqe.flags); 1121 1116 } 1122 1117 1118 + static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob, 1119 + unsigned int poll_flags) 1120 + { 1121 + struct file *file = req->file; 1122 + 1123 + if (req->opcode == IORING_OP_URING_CMD) { 1124 + struct io_uring_cmd *ioucmd; 1125 + 1126 + ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 1127 + return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags); 1128 + } else { 1129 + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 1130 + 1131 + return file->f_op->iopoll(&rw->kiocb, iob, poll_flags); 1132 + } 1133 + } 1134 + 1135 + static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req) 1136 + { 1137 + struct hrtimer_sleeper timer; 1138 + enum hrtimer_mode mode; 1139 + ktime_t kt; 1140 + u64 sleep_time; 1141 + 1142 + if (req->flags & REQ_F_IOPOLL_STATE) 1143 + return 0; 1144 + 1145 + if (ctx->hybrid_poll_time == LLONG_MAX) 1146 + return 0; 1147 + 1148 + /* Using half the running time to do schedule */ 1149 + sleep_time = ctx->hybrid_poll_time / 2; 1150 + 1151 + kt = ktime_set(0, sleep_time); 1152 + req->flags |= REQ_F_IOPOLL_STATE; 1153 + 1154 + mode = HRTIMER_MODE_REL; 1155 + hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode); 1156 + hrtimer_set_expires(&timer.timer, kt); 1157 + set_current_state(TASK_INTERRUPTIBLE); 1158 + hrtimer_sleeper_start_expires(&timer, mode); 1159 + 1160 + if (timer.task) 1161 + io_schedule(); 1162 + 1163 + hrtimer_cancel(&timer.timer); 1164 + __set_current_state(TASK_RUNNING); 1165 + 
destroy_hrtimer_on_stack(&timer.timer); 1166 + return sleep_time; 1167 + } 1168 + 1169 + static int io_uring_hybrid_poll(struct io_kiocb *req, 1170 + struct io_comp_batch *iob, unsigned int poll_flags) 1171 + { 1172 + struct io_ring_ctx *ctx = req->ctx; 1173 + u64 runtime, sleep_time; 1174 + int ret; 1175 + 1176 + sleep_time = io_hybrid_iopoll_delay(ctx, req); 1177 + ret = io_uring_classic_poll(req, iob, poll_flags); 1178 + runtime = ktime_get_ns() - req->iopoll_start - sleep_time; 1179 + 1180 + /* 1181 + * Use minimum sleep time if we're polling devices with different 1182 + * latencies. We could get more completions from the faster ones. 1183 + */ 1184 + if (ctx->hybrid_poll_time > runtime) 1185 + ctx->hybrid_poll_time = runtime; 1186 + 1187 + return ret; 1188 + } 1189 + 1123 1190 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) 1124 1191 { 1125 1192 struct io_wq_work_node *pos, *start, *prev; ··· 1208 1131 1209 1132 wq_list_for_each(pos, start, &ctx->iopoll_list) { 1210 1133 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); 1211 - struct file *file = req->file; 1212 1134 int ret; 1213 1135 1214 1136 /* ··· 1218 1142 if (READ_ONCE(req->iopoll_completed)) 1219 1143 break; 1220 1144 1221 - if (req->opcode == IORING_OP_URING_CMD) { 1222 - struct io_uring_cmd *ioucmd; 1145 + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) 1146 + ret = io_uring_hybrid_poll(req, &iob, poll_flags); 1147 + else 1148 + ret = io_uring_classic_poll(req, &iob, poll_flags); 1223 1149 1224 - ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 1225 - ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob, 1226 - poll_flags); 1227 - } else { 1228 - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 1229 - 1230 - ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags); 1231 - } 1232 1150 if (unlikely(ret < 0)) 1233 1151 return ret; 1234 1152 else if (ret)