io_uring: split out task work code into tw.c

+2 -1

io_uring/Makefile

··· 8 8 9 9 obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ 10 10 tctx.o filetable.o rw.o poll.o \ 11 - eventfd.o uring_cmd.o openclose.o \ 11 + tw.o eventfd.o uring_cmd.o openclose.o \ 12 12 sqpoll.o xattr.o nop.o fs.o splice.o \ 13 13 sync.o msg_ring.o advise.o openclose.o \ 14 14 statx.o timeout.o cancel.o \ 15 15 waitid.o register.o truncate.o \ 16 16 memmap.o alloc_cache.o query.o 17 + 17 18 obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o 18 19 obj-$(CONFIG_IO_WQ) += io-wq.o 19 20 obj-$(CONFIG_FUTEX) += futex.o

-371

io_uring/io_uring.c

··· 110 110 111 111 #define IO_COMPL_BATCH 32 112 112 #define IO_REQ_ALLOC_BATCH 8 113 - #define IO_LOCAL_TW_DEFAULT_MAX 20 114 113 115 114 /* requests with any of those set should undergo io_disarm_next() */ 116 115 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 117 - 118 - /* 119 - * No waiters. It's larger than any valid value of the tw counter 120 - * so that tests against ->cq_wait_nr would fail and skip wake_up(). 121 - */ 122 - #define IO_CQ_WAKE_INIT (-1U) 123 - /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ 124 - #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) 125 116 126 117 static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); 127 118 static void __io_req_caches_free(struct io_ring_ctx *ctx); ··· 194 203 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); 195 204 196 205 complete(&ctx->ref_comp); 197 - } 198 - 199 - /* 200 - * Terminate the request if either of these conditions are true: 201 - * 202 - * 1) It's being executed by the original task, but that task is marked 203 - * with PF_EXITING as it's exiting. 204 - * 2) PF_KTHREAD is set, in which case the invoker of the task_work is 205 - * our fallback task_work. 206 - * 3) The ring has been closed and is going away. 207 - */ 208 - static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) 209 - { 210 - return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); 211 - } 212 - 213 - static __cold void io_fallback_req_func(struct work_struct *work) 214 - { 215 - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, 216 - fallback_work.work); 217 - struct llist_node *node = llist_del_all(&ctx->fallback_llist); 218 - struct io_kiocb *req, *tmp; 219 - struct io_tw_state ts = {}; 220 - 221 - percpu_ref_get(&ctx->refs); 222 - mutex_lock(&ctx->uring_lock); 223 - ts.cancel = io_should_terminate_tw(ctx); 224 - llist_for_each_entry_safe(req, tmp, node, io_task_work.node) 225 - req->io_task_work.func((struct io_tw_req){req}, ts); 226 - io_submit_flush_completions(ctx); 227 - mutex_unlock(&ctx->uring_lock); 228 - percpu_ref_put(&ctx->refs); 229 206 } 230 207 231 208 static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) ··· 1027 1068 nxt = req->link; 1028 1069 req->link = NULL; 1029 1070 return nxt; 1030 - } 1031 - 1032 - static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) 1033 - { 1034 - if (!ctx) 1035 - return; 1036 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1037 - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1038 - 1039 - io_submit_flush_completions(ctx); 1040 - mutex_unlock(&ctx->uring_lock); 1041 - percpu_ref_put(&ctx->refs); 1042 - } 1043 - 1044 - /* 1045 - * Run queued task_work, returning the number of entries processed in *count. 1046 - * If more entries than max_entries are available, stop processing once this 1047 - * is reached and return the rest of the list. 1048 - */ 1049 - struct llist_node *io_handle_tw_list(struct llist_node *node, 1050 - unsigned int *count, 1051 - unsigned int max_entries) 1052 - { 1053 - struct io_ring_ctx *ctx = NULL; 1054 - struct io_tw_state ts = { }; 1055 - 1056 - do { 1057 - struct llist_node *next = node->next; 1058 - struct io_kiocb *req = container_of(node, struct io_kiocb, 1059 - io_task_work.node); 1060 - 1061 - if (req->ctx != ctx) { 1062 - ctx_flush_and_put(ctx, ts); 1063 - ctx = req->ctx; 1064 - mutex_lock(&ctx->uring_lock); 1065 - percpu_ref_get(&ctx->refs); 1066 - ts.cancel = io_should_terminate_tw(ctx); 1067 - } 1068 - INDIRECT_CALL_2(req->io_task_work.func, 1069 - io_poll_task_func, io_req_rw_complete, 1070 - (struct io_tw_req){req}, ts); 1071 - node = next; 1072 - (*count)++; 1073 - if (unlikely(need_resched())) { 1074 - ctx_flush_and_put(ctx, ts); 1075 - ctx = NULL; 1076 - cond_resched(); 1077 - } 1078 - } while (node && *count < max_entries); 1079 - 1080 - ctx_flush_and_put(ctx, ts); 1081 - return node; 1082 - } 1083 - 1084 - static __cold void __io_fallback_tw(struct llist_node *node, bool sync) 1085 - { 1086 - struct io_ring_ctx *last_ctx = NULL; 1087 - struct io_kiocb *req; 1088 - 1089 - while (node) { 1090 - req = container_of(node, struct io_kiocb, io_task_work.node); 1091 - node = node->next; 1092 - if (last_ctx != req->ctx) { 1093 - if (last_ctx) { 1094 - if (sync) 1095 - flush_delayed_work(&last_ctx->fallback_work); 1096 - percpu_ref_put(&last_ctx->refs); 1097 - } 1098 - last_ctx = req->ctx; 1099 - percpu_ref_get(&last_ctx->refs); 1100 - } 1101 - if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) 1102 - schedule_delayed_work(&last_ctx->fallback_work, 1); 1103 - } 1104 - 1105 - if (last_ctx) { 1106 - if (sync) 1107 - flush_delayed_work(&last_ctx->fallback_work); 1108 - percpu_ref_put(&last_ctx->refs); 1109 - } 1110 - } 1111 - 1112 - static void io_fallback_tw(struct io_uring_task *tctx, bool sync) 1113 - { 1114 - struct llist_node *node = llist_del_all(&tctx->task_list); 1115 - 1116 - __io_fallback_tw(node, sync); 1117 - } 1118 - 1119 - struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, 1120 - unsigned int max_entries, 1121 - unsigned int *count) 1122 - { 1123 - struct llist_node *node; 1124 - 1125 - node = llist_del_all(&tctx->task_list); 1126 - if (node) { 1127 - node = llist_reverse_order(node); 1128 - node = io_handle_tw_list(node, count, max_entries); 1129 - } 1130 - 1131 - /* relaxed read is enough as only the task itself sets ->in_cancel */ 1132 - if (unlikely(atomic_read(&tctx->in_cancel))) 1133 - io_uring_drop_tctx_refs(current); 1134 - 1135 - trace_io_uring_task_work_run(tctx, *count); 1136 - return node; 1137 - } 1138 - 1139 - void tctx_task_work(struct callback_head *cb) 1140 - { 1141 - struct io_uring_task *tctx; 1142 - struct llist_node *ret; 1143 - unsigned int count = 0; 1144 - 1145 - tctx = container_of(cb, struct io_uring_task, task_work); 1146 - ret = tctx_task_work_run(tctx, UINT_MAX, &count); 1147 - /* can't happen */ 1148 - WARN_ON_ONCE(ret); 1149 - } 1150 - 1151 - static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) 1152 - { 1153 - struct io_ring_ctx *ctx = req->ctx; 1154 - unsigned nr_wait, nr_tw, nr_tw_prev; 1155 - struct llist_node *head; 1156 - 1157 - /* See comment above IO_CQ_WAKE_INIT */ 1158 - BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); 1159 - 1160 - /* 1161 - * We don't know how many requests there are in the link and whether 1162 - * they can even be queued lazily, fall back to non-lazy. 1163 - */ 1164 - if (req->flags & IO_REQ_LINK_FLAGS) 1165 - flags &= ~IOU_F_TWQ_LAZY_WAKE; 1166 - 1167 - guard(rcu)(); 1168 - 1169 - head = READ_ONCE(ctx->work_llist.first); 1170 - do { 1171 - nr_tw_prev = 0; 1172 - if (head) { 1173 - struct io_kiocb *first_req = container_of(head, 1174 - struct io_kiocb, 1175 - io_task_work.node); 1176 - /* 1177 - * Might be executed at any moment, rely on 1178 - * SLAB_TYPESAFE_BY_RCU to keep it alive. 1179 - */ 1180 - nr_tw_prev = READ_ONCE(first_req->nr_tw); 1181 - } 1182 - 1183 - /* 1184 - * Theoretically, it can overflow, but that's fine as one of 1185 - * previous adds should've tried to wake the task. 1186 - */ 1187 - nr_tw = nr_tw_prev + 1; 1188 - if (!(flags & IOU_F_TWQ_LAZY_WAKE)) 1189 - nr_tw = IO_CQ_WAKE_FORCE; 1190 - 1191 - req->nr_tw = nr_tw; 1192 - req->io_task_work.node.next = head; 1193 - } while (!try_cmpxchg(&ctx->work_llist.first, &head, 1194 - &req->io_task_work.node)); 1195 - 1196 - /* 1197 - * cmpxchg implies a full barrier, which pairs with the barrier 1198 - * in set_current_state() on the io_cqring_wait() side. It's used 1199 - * to ensure that either we see updated ->cq_wait_nr, or waiters 1200 - * going to sleep will observe the work added to the list, which 1201 - * is similar to the wait/wawke task state sync. 1202 - */ 1203 - 1204 - if (!head) { 1205 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1206 - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1207 - if (ctx->has_evfd) 1208 - io_eventfd_signal(ctx, false); 1209 - } 1210 - 1211 - nr_wait = atomic_read(&ctx->cq_wait_nr); 1212 - /* not enough or no one is waiting */ 1213 - if (nr_tw < nr_wait) 1214 - return; 1215 - /* the previous add has already woken it up */ 1216 - if (nr_tw_prev >= nr_wait) 1217 - return; 1218 - wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); 1219 - } 1220 - 1221 - static void io_req_normal_work_add(struct io_kiocb *req) 1222 - { 1223 - struct io_uring_task *tctx = req->tctx; 1224 - struct io_ring_ctx *ctx = req->ctx; 1225 - 1226 - /* task_work already pending, we're done */ 1227 - if (!llist_add(&req->io_task_work.node, &tctx->task_list)) 1228 - return; 1229 - 1230 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1231 - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1232 - 1233 - /* SQPOLL doesn't need the task_work added, it'll run it itself */ 1234 - if (ctx->flags & IORING_SETUP_SQPOLL) { 1235 - __set_notify_signal(tctx->task); 1236 - return; 1237 - } 1238 - 1239 - if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) 1240 - return; 1241 - 1242 - io_fallback_tw(tctx, false); 1243 - } 1244 - 1245 - void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) 1246 - { 1247 - if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) 1248 - io_req_local_work_add(req, flags); 1249 - else 1250 - io_req_normal_work_add(req); 1251 - } 1252 - 1253 - void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) 1254 - { 1255 - if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) 1256 - return; 1257 - __io_req_task_work_add(req, flags); 1258 - } 1259 - 1260 - static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) 1261 - { 1262 - struct llist_node *node = llist_del_all(&ctx->work_llist); 1263 - 1264 - __io_fallback_tw(node, false); 1265 - node = llist_del_all(&ctx->retry_llist); 1266 - __io_fallback_tw(node, false); 1267 - } 1268 - 1269 - static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, 1270 - int min_events) 1271 - { 1272 - if (!io_local_work_pending(ctx)) 1273 - return false; 1274 - if (events < min_events) 1275 - return true; 1276 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1277 - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1278 - return false; 1279 - } 1280 - 1281 - static int __io_run_local_work_loop(struct llist_node **node, 1282 - io_tw_token_t tw, 1283 - int events) 1284 - { 1285 - int ret = 0; 1286 - 1287 - while (*node) { 1288 - struct llist_node *next = (*node)->next; 1289 - struct io_kiocb *req = container_of(*node, struct io_kiocb, 1290 - io_task_work.node); 1291 - INDIRECT_CALL_2(req->io_task_work.func, 1292 - io_poll_task_func, io_req_rw_complete, 1293 - (struct io_tw_req){req}, tw); 1294 - *node = next; 1295 - if (++ret >= events) 1296 - break; 1297 - } 1298 - 1299 - return ret; 1300 - } 1301 - 1302 - static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, 1303 - int min_events, int max_events) 1304 - { 1305 - struct llist_node *node; 1306 - unsigned int loops = 0; 1307 - int ret = 0; 1308 - 1309 - if (WARN_ON_ONCE(ctx->submitter_task != current)) 1310 - return -EEXIST; 1311 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1312 - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1313 - again: 1314 - tw.cancel = io_should_terminate_tw(ctx); 1315 - min_events -= ret; 1316 - ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); 1317 - if (ctx->retry_llist.first) 1318 - goto retry_done; 1319 - 1320 - /* 1321 - * llists are in reverse order, flip it back the right way before 1322 - * running the pending items. 1323 - */ 1324 - node = llist_reverse_order(llist_del_all(&ctx->work_llist)); 1325 - ret += __io_run_local_work_loop(&node, tw, max_events - ret); 1326 - ctx->retry_llist.first = node; 1327 - loops++; 1328 - 1329 - if (io_run_local_work_continue(ctx, ret, min_events)) 1330 - goto again; 1331 - retry_done: 1332 - io_submit_flush_completions(ctx); 1333 - if (io_run_local_work_continue(ctx, ret, min_events)) 1334 - goto again; 1335 - 1336 - trace_io_uring_local_work_run(ctx, ret, loops); 1337 - return ret; 1338 - } 1339 - 1340 - static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, 1341 - int min_events) 1342 - { 1343 - struct io_tw_state ts = {}; 1344 - 1345 - if (!io_local_work_pending(ctx)) 1346 - return 0; 1347 - return __io_run_local_work(ctx, ts, min_events, 1348 - max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 1349 - } 1350 - 1351 - int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) 1352 - { 1353 - struct io_tw_state ts = {}; 1354 - int ret; 1355 - 1356 - mutex_lock(&ctx->uring_lock); 1357 - ret = __io_run_local_work(ctx, ts, min_events, max_events); 1358 - mutex_unlock(&ctx->uring_lock); 1359 - return ret; 1360 1071 } 1361 1072 1362 1073 static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw)

+3 -76

io_uring/io_uring.h

··· 10 10 #include "alloc_cache.h" 11 11 #include "io-wq.h" 12 12 #include "slist.h" 13 + #include "tw.h" 13 14 #include "opdef.h" 14 15 15 16 #ifndef CREATE_TRACE_POINTS ··· 89 88 IOSQE_BUFFER_SELECT |\ 90 89 IOSQE_CQE_SKIP_SUCCESS) 91 90 91 + #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) 92 + 92 93 /* 93 94 * Complaint timeout for io_uring cancelation exits, and for io-wq exit 94 95 * worker waiting. ··· 159 156 int io_prepare_config(struct io_ctx_config *config); 160 157 161 158 bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); 162 - int io_run_task_work_sig(struct io_ring_ctx *ctx); 163 - int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); 164 159 void io_req_defer_failed(struct io_kiocb *req, s32 res); 165 160 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); 166 161 void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); ··· 172 171 struct file *io_file_get_fixed(struct io_kiocb *req, int fd, 173 172 unsigned issue_flags); 174 173 175 - void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); 176 - void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); 177 174 void io_req_task_queue(struct io_kiocb *req); 178 175 void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); 179 176 void io_req_task_queue_fail(struct io_kiocb *req, int ret); 180 177 void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw); 181 - struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); 182 - struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); 183 - void tctx_task_work(struct callback_head *cb); 184 178 __cold void io_uring_drop_tctx_refs(struct task_struct *task); 185 179 186 180 int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, ··· 226 230 static inline bool io_is_compat(struct io_ring_ctx *ctx) 227 231 { 228 232 return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); 229 - } 230 - 231 - static inline void io_req_task_work_add(struct io_kiocb *req) 232 - { 233 - __io_req_task_work_add(req, 0); 234 233 } 235 234 236 235 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) ··· 452 461 return min(entries, ctx->sq_entries); 453 462 } 454 463 455 - static inline int io_run_task_work(void) 456 - { 457 - bool ret = false; 458 - 459 - /* 460 - * Always check-and-clear the task_work notification signal. With how 461 - * signaling works for task_work, we can find it set with nothing to 462 - * run. We need to clear it for that case, like get_signal() does. 463 - */ 464 - if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 465 - clear_notify_signal(); 466 - /* 467 - * PF_IO_WORKER never returns to userspace, so check here if we have 468 - * notify work that needs processing. 469 - */ 470 - if (current->flags & PF_IO_WORKER) { 471 - if (test_thread_flag(TIF_NOTIFY_RESUME)) { 472 - __set_current_state(TASK_RUNNING); 473 - resume_user_mode_work(NULL); 474 - } 475 - if (current->io_uring) { 476 - unsigned int count = 0; 477 - 478 - __set_current_state(TASK_RUNNING); 479 - tctx_task_work_run(current->io_uring, UINT_MAX, &count); 480 - if (count) 481 - ret = true; 482 - } 483 - } 484 - if (task_work_pending(current)) { 485 - __set_current_state(TASK_RUNNING); 486 - task_work_run(); 487 - ret = true; 488 - } 489 - 490 - return ret; 491 - } 492 - 493 - static inline bool io_local_work_pending(struct io_ring_ctx *ctx) 494 - { 495 - return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); 496 - } 497 - 498 - static inline bool io_task_work_pending(struct io_ring_ctx *ctx) 499 - { 500 - return task_work_pending(current) || io_local_work_pending(ctx); 501 - } 502 - 503 - static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) 504 - { 505 - lockdep_assert_held(&ctx->uring_lock); 506 - } 507 - 508 464 /* 509 465 * Don't complete immediately but use deferred completion infrastructure. 510 466 * Protected by ->uring_lock and can only be used either with ··· 507 569 } 508 570 *req = io_extract_req(ctx); 509 571 return true; 510 - } 511 - 512 - static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) 513 - { 514 - return likely(ctx->submitter_task == current); 515 - } 516 - 517 - static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) 518 - { 519 - return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || 520 - ctx->submitter_task == current); 521 572 } 522 573 523 574 static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)

+354

io_uring/tw.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Task work handling for io_uring 4 + */ 5 + #include <linux/kernel.h> 6 + #include <linux/errno.h> 7 + #include <linux/sched/signal.h> 8 + #include <linux/io_uring.h> 9 + #include <linux/indirect_call_wrapper.h> 10 + 11 + #include "io_uring.h" 12 + #include "tctx.h" 13 + #include "poll.h" 14 + #include "rw.h" 15 + #include "eventfd.h" 16 + 17 + void io_fallback_req_func(struct work_struct *work) 18 + { 19 + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, 20 + fallback_work.work); 21 + struct llist_node *node = llist_del_all(&ctx->fallback_llist); 22 + struct io_kiocb *req, *tmp; 23 + struct io_tw_state ts = {}; 24 + 25 + percpu_ref_get(&ctx->refs); 26 + mutex_lock(&ctx->uring_lock); 27 + ts.cancel = io_should_terminate_tw(ctx); 28 + llist_for_each_entry_safe(req, tmp, node, io_task_work.node) 29 + req->io_task_work.func((struct io_tw_req){req}, ts); 30 + io_submit_flush_completions(ctx); 31 + mutex_unlock(&ctx->uring_lock); 32 + percpu_ref_put(&ctx->refs); 33 + } 34 + 35 + static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) 36 + { 37 + if (!ctx) 38 + return; 39 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 40 + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 41 + 42 + io_submit_flush_completions(ctx); 43 + mutex_unlock(&ctx->uring_lock); 44 + percpu_ref_put(&ctx->refs); 45 + } 46 + 47 + /* 48 + * Run queued task_work, returning the number of entries processed in *count. 49 + * If more entries than max_entries are available, stop processing once this 50 + * is reached and return the rest of the list. 51 + */ 52 + struct llist_node *io_handle_tw_list(struct llist_node *node, 53 + unsigned int *count, 54 + unsigned int max_entries) 55 + { 56 + struct io_ring_ctx *ctx = NULL; 57 + struct io_tw_state ts = { }; 58 + 59 + do { 60 + struct llist_node *next = node->next; 61 + struct io_kiocb *req = container_of(node, struct io_kiocb, 62 + io_task_work.node); 63 + 64 + if (req->ctx != ctx) { 65 + ctx_flush_and_put(ctx, ts); 66 + ctx = req->ctx; 67 + mutex_lock(&ctx->uring_lock); 68 + percpu_ref_get(&ctx->refs); 69 + ts.cancel = io_should_terminate_tw(ctx); 70 + } 71 + INDIRECT_CALL_2(req->io_task_work.func, 72 + io_poll_task_func, io_req_rw_complete, 73 + (struct io_tw_req){req}, ts); 74 + node = next; 75 + (*count)++; 76 + if (unlikely(need_resched())) { 77 + ctx_flush_and_put(ctx, ts); 78 + ctx = NULL; 79 + cond_resched(); 80 + } 81 + } while (node && *count < max_entries); 82 + 83 + ctx_flush_and_put(ctx, ts); 84 + return node; 85 + } 86 + 87 + static __cold void __io_fallback_tw(struct llist_node *node, bool sync) 88 + { 89 + struct io_ring_ctx *last_ctx = NULL; 90 + struct io_kiocb *req; 91 + 92 + while (node) { 93 + req = container_of(node, struct io_kiocb, io_task_work.node); 94 + node = node->next; 95 + if (last_ctx != req->ctx) { 96 + if (last_ctx) { 97 + if (sync) 98 + flush_delayed_work(&last_ctx->fallback_work); 99 + percpu_ref_put(&last_ctx->refs); 100 + } 101 + last_ctx = req->ctx; 102 + percpu_ref_get(&last_ctx->refs); 103 + } 104 + if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) 105 + schedule_delayed_work(&last_ctx->fallback_work, 1); 106 + } 107 + 108 + if (last_ctx) { 109 + if (sync) 110 + flush_delayed_work(&last_ctx->fallback_work); 111 + percpu_ref_put(&last_ctx->refs); 112 + } 113 + } 114 + 115 + static void io_fallback_tw(struct io_uring_task *tctx, bool sync) 116 + { 117 + struct llist_node *node = llist_del_all(&tctx->task_list); 118 + 119 + __io_fallback_tw(node, sync); 120 + } 121 + 122 + struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, 123 + unsigned int max_entries, 124 + unsigned int *count) 125 + { 126 + struct llist_node *node; 127 + 128 + node = llist_del_all(&tctx->task_list); 129 + if (node) { 130 + node = llist_reverse_order(node); 131 + node = io_handle_tw_list(node, count, max_entries); 132 + } 133 + 134 + /* relaxed read is enough as only the task itself sets ->in_cancel */ 135 + if (unlikely(atomic_read(&tctx->in_cancel))) 136 + io_uring_drop_tctx_refs(current); 137 + 138 + trace_io_uring_task_work_run(tctx, *count); 139 + return node; 140 + } 141 + 142 + void tctx_task_work(struct callback_head *cb) 143 + { 144 + struct io_uring_task *tctx; 145 + struct llist_node *ret; 146 + unsigned int count = 0; 147 + 148 + tctx = container_of(cb, struct io_uring_task, task_work); 149 + ret = tctx_task_work_run(tctx, UINT_MAX, &count); 150 + /* can't happen */ 151 + WARN_ON_ONCE(ret); 152 + } 153 + 154 + void io_req_local_work_add(struct io_kiocb *req, unsigned flags) 155 + { 156 + struct io_ring_ctx *ctx = req->ctx; 157 + unsigned nr_wait, nr_tw, nr_tw_prev; 158 + struct llist_node *head; 159 + 160 + /* See comment above IO_CQ_WAKE_INIT */ 161 + BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); 162 + 163 + /* 164 + * We don't know how many requests there are in the link and whether 165 + * they can even be queued lazily, fall back to non-lazy. 166 + */ 167 + if (req->flags & IO_REQ_LINK_FLAGS) 168 + flags &= ~IOU_F_TWQ_LAZY_WAKE; 169 + 170 + guard(rcu)(); 171 + 172 + head = READ_ONCE(ctx->work_llist.first); 173 + do { 174 + nr_tw_prev = 0; 175 + if (head) { 176 + struct io_kiocb *first_req = container_of(head, 177 + struct io_kiocb, 178 + io_task_work.node); 179 + /* 180 + * Might be executed at any moment, rely on 181 + * SLAB_TYPESAFE_BY_RCU to keep it alive. 182 + */ 183 + nr_tw_prev = READ_ONCE(first_req->nr_tw); 184 + } 185 + 186 + /* 187 + * Theoretically, it can overflow, but that's fine as one of 188 + * previous adds should've tried to wake the task. 189 + */ 190 + nr_tw = nr_tw_prev + 1; 191 + if (!(flags & IOU_F_TWQ_LAZY_WAKE)) 192 + nr_tw = IO_CQ_WAKE_FORCE; 193 + 194 + req->nr_tw = nr_tw; 195 + req->io_task_work.node.next = head; 196 + } while (!try_cmpxchg(&ctx->work_llist.first, &head, 197 + &req->io_task_work.node)); 198 + 199 + /* 200 + * cmpxchg implies a full barrier, which pairs with the barrier 201 + * in set_current_state() on the io_cqring_wait() side. It's used 202 + * to ensure that either we see updated ->cq_wait_nr, or waiters 203 + * going to sleep will observe the work added to the list, which 204 + * is similar to the wait/wawke task state sync. 205 + */ 206 + 207 + if (!head) { 208 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 209 + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 210 + if (ctx->has_evfd) 211 + io_eventfd_signal(ctx, false); 212 + } 213 + 214 + nr_wait = atomic_read(&ctx->cq_wait_nr); 215 + /* not enough or no one is waiting */ 216 + if (nr_tw < nr_wait) 217 + return; 218 + /* the previous add has already woken it up */ 219 + if (nr_tw_prev >= nr_wait) 220 + return; 221 + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); 222 + } 223 + 224 + void io_req_normal_work_add(struct io_kiocb *req) 225 + { 226 + struct io_uring_task *tctx = req->tctx; 227 + struct io_ring_ctx *ctx = req->ctx; 228 + 229 + /* task_work already pending, we're done */ 230 + if (!llist_add(&req->io_task_work.node, &tctx->task_list)) 231 + return; 232 + 233 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 234 + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 235 + 236 + /* SQPOLL doesn't need the task_work added, it'll run it itself */ 237 + if (ctx->flags & IORING_SETUP_SQPOLL) { 238 + __set_notify_signal(tctx->task); 239 + return; 240 + } 241 + 242 + if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) 243 + return; 244 + 245 + io_fallback_tw(tctx, false); 246 + } 247 + 248 + void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) 249 + { 250 + if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) 251 + return; 252 + __io_req_task_work_add(req, flags); 253 + } 254 + 255 + void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) 256 + { 257 + struct llist_node *node = llist_del_all(&ctx->work_llist); 258 + 259 + __io_fallback_tw(node, false); 260 + node = llist_del_all(&ctx->retry_llist); 261 + __io_fallback_tw(node, false); 262 + } 263 + 264 + static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, 265 + int min_events) 266 + { 267 + if (!io_local_work_pending(ctx)) 268 + return false; 269 + if (events < min_events) 270 + return true; 271 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 272 + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 273 + return false; 274 + } 275 + 276 + static int __io_run_local_work_loop(struct llist_node **node, 277 + io_tw_token_t tw, 278 + int events) 279 + { 280 + int ret = 0; 281 + 282 + while (*node) { 283 + struct llist_node *next = (*node)->next; 284 + struct io_kiocb *req = container_of(*node, struct io_kiocb, 285 + io_task_work.node); 286 + INDIRECT_CALL_2(req->io_task_work.func, 287 + io_poll_task_func, io_req_rw_complete, 288 + (struct io_tw_req){req}, tw); 289 + *node = next; 290 + if (++ret >= events) 291 + break; 292 + } 293 + 294 + return ret; 295 + } 296 + 297 + static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, 298 + int min_events, int max_events) 299 + { 300 + struct llist_node *node; 301 + unsigned int loops = 0; 302 + int ret = 0; 303 + 304 + if (WARN_ON_ONCE(ctx->submitter_task != current)) 305 + return -EEXIST; 306 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 307 + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 308 + again: 309 + tw.cancel = io_should_terminate_tw(ctx); 310 + min_events -= ret; 311 + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); 312 + if (ctx->retry_llist.first) 313 + goto retry_done; 314 + 315 + /* 316 + * llists are in reverse order, flip it back the right way before 317 + * running the pending items. 318 + */ 319 + node = llist_reverse_order(llist_del_all(&ctx->work_llist)); 320 + ret += __io_run_local_work_loop(&node, tw, max_events - ret); 321 + ctx->retry_llist.first = node; 322 + loops++; 323 + 324 + if (io_run_local_work_continue(ctx, ret, min_events)) 325 + goto again; 326 + retry_done: 327 + io_submit_flush_completions(ctx); 328 + if (io_run_local_work_continue(ctx, ret, min_events)) 329 + goto again; 330 + 331 + trace_io_uring_local_work_run(ctx, ret, loops); 332 + return ret; 333 + } 334 + 335 + int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events) 336 + { 337 + struct io_tw_state ts = {}; 338 + 339 + if (!io_local_work_pending(ctx)) 340 + return 0; 341 + return __io_run_local_work(ctx, ts, min_events, 342 + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 343 + } 344 + 345 + int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) 346 + { 347 + struct io_tw_state ts = {}; 348 + int ret; 349 + 350 + mutex_lock(&ctx->uring_lock); 351 + ret = __io_run_local_work(ctx, ts, min_events, max_events); 352 + mutex_unlock(&ctx->uring_lock); 353 + return ret; 354 + }

+124

io_uring/tw.h

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef IOU_TW_H 3 + #define IOU_TW_H 4 + 5 + #include <linux/sched.h> 6 + #include <linux/percpu-refcount.h> 7 + #include <linux/io_uring_types.h> 8 + 9 + #define IO_LOCAL_TW_DEFAULT_MAX 20 10 + 11 + /* 12 + * No waiters. It's larger than any valid value of the tw counter 13 + * so that tests against ->cq_wait_nr would fail and skip wake_up(). 14 + */ 15 + #define IO_CQ_WAKE_INIT (-1U) 16 + /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ 17 + #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) 18 + 19 + /* 20 + * Terminate the request if either of these conditions are true: 21 + * 22 + * 1) It's being executed by the original task, but that task is marked 23 + * with PF_EXITING as it's exiting. 24 + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is 25 + * our fallback task_work. 26 + * 3) The ring has been closed and is going away. 27 + */ 28 + static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) 29 + { 30 + return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); 31 + } 32 + 33 + void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); 34 + struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); 35 + void tctx_task_work(struct callback_head *cb); 36 + int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); 37 + int io_run_task_work_sig(struct io_ring_ctx *ctx); 38 + 39 + __cold void io_fallback_req_func(struct work_struct *work); 40 + __cold void io_move_task_work_from_local(struct io_ring_ctx *ctx); 41 + int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events); 42 + 43 + void io_req_local_work_add(struct io_kiocb *req, unsigned flags); 44 + void io_req_normal_work_add(struct io_kiocb *req); 45 + struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); 46 + 47 + static inline void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) 48 + { 49 + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) 50 + io_req_local_work_add(req, flags); 51 + else 52 + io_req_normal_work_add(req); 53 + } 54 + 55 + static inline void io_req_task_work_add(struct io_kiocb *req) 56 + { 57 + __io_req_task_work_add(req, 0); 58 + } 59 + 60 + static inline int io_run_task_work(void) 61 + { 62 + bool ret = false; 63 + 64 + /* 65 + * Always check-and-clear the task_work notification signal. With how 66 + * signaling works for task_work, we can find it set with nothing to 67 + * run. We need to clear it for that case, like get_signal() does. 68 + */ 69 + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 70 + clear_notify_signal(); 71 + /* 72 + * PF_IO_WORKER never returns to userspace, so check here if we have 73 + * notify work that needs processing. 74 + */ 75 + if (current->flags & PF_IO_WORKER) { 76 + if (test_thread_flag(TIF_NOTIFY_RESUME)) { 77 + __set_current_state(TASK_RUNNING); 78 + resume_user_mode_work(NULL); 79 + } 80 + if (current->io_uring) { 81 + unsigned int count = 0; 82 + 83 + __set_current_state(TASK_RUNNING); 84 + tctx_task_work_run(current->io_uring, UINT_MAX, &count); 85 + if (count) 86 + ret = true; 87 + } 88 + } 89 + if (task_work_pending(current)) { 90 + __set_current_state(TASK_RUNNING); 91 + task_work_run(); 92 + ret = true; 93 + } 94 + 95 + return ret; 96 + } 97 + 98 + static inline bool io_local_work_pending(struct io_ring_ctx *ctx) 99 + { 100 + return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); 101 + } 102 + 103 + static inline bool io_task_work_pending(struct io_ring_ctx *ctx) 104 + { 105 + return task_work_pending(current) || io_local_work_pending(ctx); 106 + } 107 + 108 + static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) 109 + { 110 + lockdep_assert_held(&ctx->uring_lock); 111 + } 112 + 113 + static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) 114 + { 115 + return likely(ctx->submitter_task == current); 116 + } 117 + 118 + static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) 119 + { 120 + return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || 121 + ctx->submitter_task == current); 122 + } 123 + 124 + #endif

Configure Feed

Configure Feed