Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring/sqpoll: manage task_work privately

Decouple from task_work running, and cap the number of entries we process
at a time. If we exceed that number, push the remaining entries to a retry
list that we'll process first next time.

We cap the number of entries to process at 8, which is fairly arbitrary.
We just want to get enough per-ctx batching here, while not processing
endlessly.

Since we manually run PF_IO_WORKER related task_work anyway as the task
never exits to userspace, with this we no longer need to add an actual
task_work item to the per-process list.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+82 -17
+36 -10
io_uring/io_uring.c
··· 1173 1173 percpu_ref_put(&ctx->refs); 1174 1174 } 1175 1175 1176 - static void handle_tw_list(struct llist_node *node, unsigned int *count) 1176 + /* 1177 + * Run queued task_work, returning the number of entries processed in *count. 1178 + * If more entries than max_entries are available, stop processing once this 1179 + * is reached and return the rest of the list. 1180 + */ 1181 + struct llist_node *io_handle_tw_list(struct llist_node *node, 1182 + unsigned int *count, 1183 + unsigned int max_entries) 1177 1184 { 1178 1185 struct io_ring_ctx *ctx = NULL; 1179 1186 struct io_tw_state ts = { }; ··· 1207 1200 ctx = NULL; 1208 1201 cond_resched(); 1209 1202 } 1210 - } while (node); 1203 + } while (node && *count < max_entries); 1211 1204 1212 1205 ctx_flush_and_put(ctx, &ts); 1206 + return node; 1213 1207 } 1214 1208 1215 1209 /** ··· 1255 1247 } 1256 1248 } 1257 1249 1258 - void tctx_task_work(struct callback_head *cb) 1250 + struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, 1251 + unsigned int max_entries, 1252 + unsigned int *count) 1259 1253 { 1260 - struct io_uring_task *tctx = container_of(cb, struct io_uring_task, 1261 - task_work); 1262 1254 struct llist_node *node; 1263 - unsigned int count = 0; 1264 1255 1265 1256 if (unlikely(current->flags & PF_EXITING)) { 1266 1257 io_fallback_tw(tctx, true); 1267 - return; 1258 + return NULL; 1268 1259 } 1269 1260 1270 1261 node = llist_del_all(&tctx->task_list); 1271 - if (node) 1272 - handle_tw_list(llist_reverse_order(node), &count); 1262 + if (node) { 1263 + node = llist_reverse_order(node); 1264 + node = io_handle_tw_list(node, count, max_entries); 1265 + } 1273 1266 1274 1267 /* relaxed read is enough as only the task itself sets ->in_cancel */ 1275 1268 if (unlikely(atomic_read(&tctx->in_cancel))) 1276 1269 io_uring_drop_tctx_refs(current); 1277 1270 1278 - trace_io_uring_task_work_run(tctx, count); 1271 + trace_io_uring_task_work_run(tctx, *count); 1272 + return node; 1273 + } 1274 + 1275 + 
void tctx_task_work(struct callback_head *cb) 1276 + { 1277 + struct io_uring_task *tctx; 1278 + struct llist_node *ret; 1279 + unsigned int count = 0; 1280 + 1281 + tctx = container_of(cb, struct io_uring_task, task_work); 1282 + ret = tctx_task_work_run(tctx, UINT_MAX, &count); 1283 + /* can't happen */ 1284 + WARN_ON_ONCE(ret); 1279 1285 } 1280 1286 1281 1287 static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) ··· 1371 1349 1372 1350 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1373 1351 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1352 + 1353 + /* SQPOLL doesn't need the task_work added, it'll run it itself */ 1354 + if (ctx->flags & IORING_SETUP_SQPOLL) 1355 + return; 1374 1356 1375 1357 if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) 1376 1358 return;
+18 -6
io_uring/io_uring.h
··· 57 57 void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); 58 58 void io_req_task_queue_fail(struct io_kiocb *req, int ret); 59 59 void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); 60 + struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); 61 + struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); 60 62 void tctx_task_work(struct callback_head *cb); 61 63 __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 62 64 int io_uring_alloc_task_context(struct task_struct *task, ··· 277 275 278 276 static inline int io_run_task_work(void) 279 277 { 278 + bool ret = false; 279 + 280 280 /* 281 281 * Always check-and-clear the task_work notification signal. With how 282 282 * signaling works for task_work, we can find it set with nothing to ··· 290 286 * PF_IO_WORKER never returns to userspace, so check here if we have 291 287 * notify work that needs processing. 292 288 */ 293 - if (current->flags & PF_IO_WORKER && 294 - test_thread_flag(TIF_NOTIFY_RESUME)) { 295 - __set_current_state(TASK_RUNNING); 296 - resume_user_mode_work(NULL); 289 + if (current->flags & PF_IO_WORKER) { 290 + if (test_thread_flag(TIF_NOTIFY_RESUME)) { 291 + __set_current_state(TASK_RUNNING); 292 + resume_user_mode_work(NULL); 293 + } 294 + if (current->io_uring) { 295 + unsigned int count = 0; 296 + 297 + tctx_task_work_run(current->io_uring, UINT_MAX, &count); 298 + if (count) 299 + ret = true; 300 + } 297 301 } 298 302 if (task_work_pending(current)) { 299 303 __set_current_state(TASK_RUNNING); 300 304 task_work_run(); 301 - return 1; 305 + ret = true; 302 306 } 303 307 304 - return 0; 308 + return ret; 305 309 } 306 310 307 311 static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
+28 -1
io_uring/sqpoll.c
··· 18 18 #include "sqpoll.h" 19 19 20 20 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 21 + #define IORING_TW_CAP_ENTRIES_VALUE 8 21 22 22 23 enum { 23 24 IO_SQ_THREAD_SHOULD_STOP = 0, ··· 220 219 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 221 220 } 222 221 222 + /* 223 + * Run task_work, processing the retry_list first. The retry_list holds 224 + * entries that we passed on in the previous run, if we had more task_work 225 + * than we were asked to process. Newly queued task_work isn't run until the 226 + * retry list has been fully processed. 227 + */ 228 + static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries) 229 + { 230 + struct io_uring_task *tctx = current->io_uring; 231 + unsigned int count = 0; 232 + 233 + if (*retry_list) { 234 + *retry_list = io_handle_tw_list(*retry_list, &count, max_entries); 235 + if (count >= max_entries) 236 + return count; 237 + max_entries -= count; 238 + } 239 + 240 + *retry_list = tctx_task_work_run(tctx, max_entries, &count); 241 + return count; 242 + } 243 + 223 244 static int io_sq_thread(void *data) 224 245 { 246 + struct llist_node *retry_list = NULL; 225 247 struct io_sq_data *sqd = data; 226 248 struct io_ring_ctx *ctx; 227 249 unsigned long timeout = 0; ··· 281 257 if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) 282 258 sqt_spin = true; 283 259 } 284 - if (io_run_task_work()) 260 + if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) 285 261 sqt_spin = true; 286 262 287 263 if (sqt_spin || !time_after(jiffies, timeout)) { ··· 335 311 finish_wait(&sqd->wait, &wait); 336 312 timeout = jiffies + sqd->sq_thread_idle; 337 313 } 314 + 315 + if (retry_list) 316 + io_sq_tw(&retry_list, UINT_MAX); 338 317 339 318 io_uring_cancel_generic(true, sqd); 340 319 sqd->thread = NULL;