Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: move SQPOLL related handling into its own file

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+497 -462
+2 -1
io_uring/Makefile
··· 5 5 obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ 6 6 sync.o advise.o filetable.o \ 7 7 openclose.o uring_cmd.o epoll.o \ 8 - statx.o net.o msg_ring.o timeout.o 8 + statx.o net.o msg_ring.o timeout.o \ 9 + sqpoll.o 9 10 obj-$(CONFIG_IO_WQ) += io-wq.o
+6 -461
io_uring/io_uring.c
··· 92 92 #include "io_uring_types.h" 93 93 #include "io_uring.h" 94 94 #include "refs.h" 95 + #include "sqpoll.h" 95 96 96 97 #include "xattr.h" 97 98 #include "nop.h" ··· 110 109 111 110 #define IORING_MAX_ENTRIES 32768 112 111 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) 113 - #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 114 112 115 113 /* only define max */ 116 114 #define IORING_MAX_FIXED_FILES (1U << 20) ··· 212 212 __u32 len; 213 213 __u16 bid; 214 214 __u16 bgid; 215 - }; 216 - 217 - enum { 218 - IO_SQ_THREAD_SHOULD_STOP = 0, 219 - IO_SQ_THREAD_SHOULD_PARK, 220 - }; 221 - 222 - struct io_sq_data { 223 - refcount_t refs; 224 - atomic_t park_pending; 225 - struct mutex lock; 226 - 227 - /* ctx's that are using this sqd */ 228 - struct list_head ctx_list; 229 - 230 - struct task_struct *thread; 231 - struct wait_queue_head wait; 232 - 233 - unsigned sq_thread_idle; 234 - int sq_cpu; 235 - pid_t task_pid; 236 - pid_t task_tgid; 237 - 238 - unsigned long state; 239 - struct completion exited; 240 215 }; 241 216 242 217 #define IO_COMPL_BATCH 32 ··· 377 402 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 378 403 struct task_struct *task, 379 404 bool cancel_all); 380 - static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 381 405 382 406 static void io_dismantle_req(struct io_kiocb *req); 383 407 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, ··· 1051 1077 } 1052 1078 if (ctx->has_evfd) 1053 1079 io_eventfd_signal(ctx); 1054 - } 1055 - 1056 - static inline bool io_sqring_full(struct io_ring_ctx *ctx) 1057 - { 1058 - struct io_rings *r = ctx->rings; 1059 - 1060 - return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; 1061 1080 } 1062 1081 1063 1082 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) ··· 1941 1974 return __io_cqring_events(ctx); 1942 1975 } 1943 1976 1944 - static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) 1945 - { 1946 - struct io_rings *rings = ctx->rings; 1947 - 1948 - /* make sure SQ entry isn't read before tail */ 1949 - return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; 1950 - } 1951 - 1952 - static inline bool io_run_task_work(void) 1953 - { 1954 - if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { 1955 - __set_current_state(TASK_RUNNING); 1956 - clear_notify_signal(); 1957 - if (task_work_pending(current)) 1958 - task_work_run(); 1959 - return true; 1960 - } 1961 - 1962 - return false; 1963 - } 1964 - 1965 - static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) 1977 + int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) 1966 1978 { 1967 1979 struct io_wq_work_node *pos, *start, *prev; 1968 1980 unsigned int poll_flags = BLK_POLL_NOSLEEP; ··· 5243 5297 return NULL; 5244 5298 } 5245 5299 5246 - static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 5300 + int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 5247 5301 __must_hold(&ctx->uring_lock) 5248 5302 { 5249 5303 unsigned int entries = io_sqring_entries(ctx); ··· 5293 5347 /* Commit SQ ring head once we've consumed and submitted all SQEs */ 5294 5348 io_commit_sqring(ctx); 5295 5349 return ret; 5296 - } 5297 - 5298 - static inline bool io_sqd_events_pending(struct io_sq_data *sqd) 5299 - { 5300 - return READ_ONCE(sqd->state); 5301 - } 5302 - 5303 - static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) 5304 - { 5305 - unsigned int to_submit; 5306 - int ret = 0; 5307 - 5308 - to_submit = io_sqring_entries(ctx); 5309 - /* if we're handling multiple rings, cap submit size for fairness */ 5310 - if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) 5311 - to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; 5312 - 5313 - if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { 5314 - const struct cred *creds = NULL; 5315 - 5316 - if (ctx->sq_creds != current_cred()) 5317 - creds = override_creds(ctx->sq_creds); 5318 - 5319 - mutex_lock(&ctx->uring_lock); 5320 - if (!wq_list_empty(&ctx->iopoll_list)) 5321 - io_do_iopoll(ctx, true); 5322 - 5323 - /* 5324 - * Don't submit if refs are dying, good for io_uring_register(), 5325 - * but also it is relied upon by io_ring_exit_work() 5326 - */ 5327 - if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && 5328 - !(ctx->flags & IORING_SETUP_R_DISABLED)) 5329 - ret = io_submit_sqes(ctx, to_submit); 5330 - mutex_unlock(&ctx->uring_lock); 5331 - 5332 - if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) 5333 - wake_up(&ctx->sqo_sq_wait); 5334 - if (creds) 5335 - revert_creds(creds); 5336 - } 5337 - 5338 - return ret; 5339 - } 5340 - 5341 - static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) 5342 - { 5343 - struct io_ring_ctx *ctx; 5344 - unsigned sq_thread_idle = 0; 5345 - 5346 - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 5347 - sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); 5348 - sqd->sq_thread_idle = sq_thread_idle; 5349 - } 5350 - 5351 - static bool io_sqd_handle_event(struct io_sq_data *sqd) 5352 - { 5353 - bool did_sig = false; 5354 - struct ksignal ksig; 5355 - 5356 - if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || 5357 - signal_pending(current)) { 5358 - mutex_unlock(&sqd->lock); 5359 - if (signal_pending(current)) 5360 - did_sig = get_signal(&ksig); 5361 - cond_resched(); 5362 - mutex_lock(&sqd->lock); 5363 - } 5364 - return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 5365 - } 5366 - 5367 - static int io_sq_thread(void *data) 5368 - { 5369 - struct io_sq_data *sqd = data; 5370 - struct io_ring_ctx *ctx; 5371 - unsigned long timeout = 0; 5372 - char buf[TASK_COMM_LEN]; 5373 - DEFINE_WAIT(wait); 5374 - 5375 - snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); 5376 - set_task_comm(current, buf); 5377 - 5378 - if (sqd->sq_cpu != -1) 5379 - set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); 5380 - else 5381 - set_cpus_allowed_ptr(current, cpu_online_mask); 5382 - current->flags |= PF_NO_SETAFFINITY; 5383 - 5384 - audit_alloc_kernel(current); 5385 - 5386 - mutex_lock(&sqd->lock); 5387 - while (1) { 5388 - bool cap_entries, sqt_spin = false; 5389 - 5390 - if (io_sqd_events_pending(sqd) || signal_pending(current)) { 5391 - if (io_sqd_handle_event(sqd)) 5392 - break; 5393 - timeout = jiffies + sqd->sq_thread_idle; 5394 - } 5395 - 5396 - cap_entries = !list_is_singular(&sqd->ctx_list); 5397 - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 5398 - int ret = __io_sq_thread(ctx, cap_entries); 5399 - 5400 - if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) 5401 - sqt_spin = true; 5402 - } 5403 - if (io_run_task_work()) 5404 - sqt_spin = true; 5405 - 5406 - if (sqt_spin || !time_after(jiffies, timeout)) { 5407 - cond_resched(); 5408 - if (sqt_spin) 5409 - timeout = jiffies + sqd->sq_thread_idle; 5410 - continue; 5411 - } 5412 - 5413 - prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); 5414 - if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { 5415 - bool needs_sched = true; 5416 - 5417 - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 5418 - atomic_or(IORING_SQ_NEED_WAKEUP, 5419 - &ctx->rings->sq_flags); 5420 - if ((ctx->flags & IORING_SETUP_IOPOLL) && 5421 - !wq_list_empty(&ctx->iopoll_list)) { 5422 - needs_sched = false; 5423 - break; 5424 - } 5425 - 5426 - /* 5427 - * Ensure the store of the wakeup flag is not 5428 - * reordered with the load of the SQ tail 5429 - */ 5430 - smp_mb__after_atomic(); 5431 - 5432 - if (io_sqring_entries(ctx)) { 5433 - needs_sched = false; 5434 - break; 5435 - } 5436 - } 5437 - 5438 - if (needs_sched) { 5439 - mutex_unlock(&sqd->lock); 5440 - schedule(); 5441 - mutex_lock(&sqd->lock); 5442 - } 5443 - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 5444 - atomic_andnot(IORING_SQ_NEED_WAKEUP, 5445 - &ctx->rings->sq_flags); 5446 - } 5447 - 5448 - finish_wait(&sqd->wait, &wait); 5449 - timeout = jiffies + sqd->sq_thread_idle; 5450 - } 5451 - 5452 - io_uring_cancel_generic(true, sqd); 5453 - sqd->thread = NULL; 5454 - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 5455 - atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); 5456 - io_run_task_work(); 5457 - mutex_unlock(&sqd->lock); 5458 - 5459 - audit_free(current); 5460 - 5461 - complete(&sqd->exited); 5462 - do_exit(0); 5463 5350 } 5464 5351 5465 5352 struct io_wait_queue { ··· 5711 5932 if (!ret) 5712 5933 __io_sqe_files_unregister(ctx); 5713 5934 return ret; 5714 - } 5715 - 5716 - static void io_sq_thread_unpark(struct io_sq_data *sqd) 5717 - __releases(&sqd->lock) 5718 - { 5719 - WARN_ON_ONCE(sqd->thread == current); 5720 - 5721 - /* 5722 - * Do the dance but not conditional clear_bit() because it'd race with 5723 - * other threads incrementing park_pending and setting the bit. 5724 - */ 5725 - clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 5726 - if (atomic_dec_return(&sqd->park_pending)) 5727 - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 5728 - mutex_unlock(&sqd->lock); 5729 - } 5730 - 5731 - static void io_sq_thread_park(struct io_sq_data *sqd) 5732 - __acquires(&sqd->lock) 5733 - { 5734 - WARN_ON_ONCE(sqd->thread == current); 5735 - 5736 - atomic_inc(&sqd->park_pending); 5737 - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 5738 - mutex_lock(&sqd->lock); 5739 - if (sqd->thread) 5740 - wake_up_process(sqd->thread); 5741 - } 5742 - 5743 - static void io_sq_thread_stop(struct io_sq_data *sqd) 5744 - { 5745 - WARN_ON_ONCE(sqd->thread == current); 5746 - WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); 5747 - 5748 - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 5749 - mutex_lock(&sqd->lock); 5750 - if (sqd->thread) 5751 - wake_up_process(sqd->thread); 5752 - mutex_unlock(&sqd->lock); 5753 - wait_for_completion(&sqd->exited); 5754 - } 5755 - 5756 - static void io_put_sq_data(struct io_sq_data *sqd) 5757 - { 5758 - if (refcount_dec_and_test(&sqd->refs)) { 5759 - WARN_ON_ONCE(atomic_read(&sqd->park_pending)); 5760 - 5761 - io_sq_thread_stop(sqd); 5762 - kfree(sqd); 5763 - } 5764 - } 5765 - 5766 - static void io_sq_thread_finish(struct io_ring_ctx *ctx) 5767 - { 5768 - struct io_sq_data *sqd = ctx->sq_data; 5769 - 5770 - if (sqd) { 5771 - io_sq_thread_park(sqd); 5772 - list_del_init(&ctx->sqd_list); 5773 - io_sqd_update_thread_idle(sqd); 5774 - io_sq_thread_unpark(sqd); 5775 - 5776 - io_put_sq_data(sqd); 5777 - ctx->sq_data = NULL; 5778 - } 5779 - } 5780 - 5781 - static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) 5782 - { 5783 - struct io_ring_ctx *ctx_attach; 5784 - struct io_sq_data *sqd; 5785 - struct fd f; 5786 - 5787 - f = fdget(p->wq_fd); 5788 - if (!f.file) 5789 - return ERR_PTR(-ENXIO); 5790 - if (f.file->f_op != &io_uring_fops) { 5791 - fdput(f); 5792 - return ERR_PTR(-EINVAL); 5793 - } 5794 - 5795 - ctx_attach = f.file->private_data; 5796 - sqd = ctx_attach->sq_data; 5797 - if (!sqd) { 5798 - fdput(f); 5799 - return ERR_PTR(-EINVAL); 5800 - } 5801 - if (sqd->task_tgid != current->tgid) { 5802 - fdput(f); 5803 - return ERR_PTR(-EPERM); 5804 - } 5805 - 5806 - refcount_inc(&sqd->refs); 5807 - fdput(f); 5808 - return sqd; 5809 - } 5810 - 5811 - static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, 5812 - bool *attached) 5813 - { 5814 - struct io_sq_data *sqd; 5815 - 5816 - *attached = false; 5817 - if (p->flags & IORING_SETUP_ATTACH_WQ) { 5818 - sqd = io_attach_sq_data(p); 5819 - if (!IS_ERR(sqd)) { 5820 - *attached = true; 5821 - return sqd; 5822 - } 5823 - /* fall through for EPERM case, setup new sqd/task */ 5824 - if (PTR_ERR(sqd) != -EPERM) 5825 - return sqd; 5826 - } 5827 - 5828 - sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); 5829 - if (!sqd) 5830 - return ERR_PTR(-ENOMEM); 5831 - 5832 - atomic_set(&sqd->park_pending, 0); 5833 - refcount_set(&sqd->refs, 1); 5834 - INIT_LIST_HEAD(&sqd->ctx_list); 5835 - mutex_init(&sqd->lock); 5836 - init_waitqueue_head(&sqd->wait); 5837 - init_completion(&sqd->exited); 5838 - return sqd; 5839 5935 } 5840 5936 5841 5937 /* ··· 6149 6495 return io_wq_create(concurrency, &data); 6150 6496 } 6151 6497 6152 - static __cold int io_uring_alloc_task_context(struct task_struct *task, 6153 - struct io_ring_ctx *ctx) 6498 + __cold int io_uring_alloc_task_context(struct task_struct *task, 6499 + struct io_ring_ctx *ctx) 6154 6500 { 6155 6501 struct io_uring_task *tctx; 6156 6502 int ret; ··· 6206 6552 percpu_counter_destroy(&tctx->inflight); 6207 6553 kfree(tctx); 6208 6554 tsk->io_uring = NULL; 6209 - } 6210 - 6211 - static __cold int io_sq_offload_create(struct io_ring_ctx *ctx, 6212 - struct io_uring_params *p) 6213 - { 6214 - int ret; 6215 - 6216 - /* Retain compatibility with failing for an invalid attach attempt */ 6217 - if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == 6218 - IORING_SETUP_ATTACH_WQ) { 6219 - struct fd f; 6220 - 6221 - f = fdget(p->wq_fd); 6222 - if (!f.file) 6223 - return -ENXIO; 6224 - if (f.file->f_op != &io_uring_fops) { 6225 - fdput(f); 6226 - return -EINVAL; 6227 - } 6228 - fdput(f); 6229 - } 6230 - if (ctx->flags & IORING_SETUP_SQPOLL) { 6231 - struct task_struct *tsk; 6232 - struct io_sq_data *sqd; 6233 - bool attached; 6234 - 6235 - ret = security_uring_sqpoll(); 6236 - if (ret) 6237 - return ret; 6238 - 6239 - sqd = io_get_sq_data(p, &attached); 6240 - if (IS_ERR(sqd)) { 6241 - ret = PTR_ERR(sqd); 6242 - goto err; 6243 - } 6244 - 6245 - ctx->sq_creds = get_current_cred(); 6246 - ctx->sq_data = sqd; 6247 - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); 6248 - if (!ctx->sq_thread_idle) 6249 - ctx->sq_thread_idle = HZ; 6250 - 6251 - io_sq_thread_park(sqd); 6252 - list_add(&ctx->sqd_list, &sqd->ctx_list); 6253 - io_sqd_update_thread_idle(sqd); 6254 - /* don't attach to a dying SQPOLL thread, would be racy */ 6255 - ret = (attached && !sqd->thread) ? -ENXIO : 0; 6256 - io_sq_thread_unpark(sqd); 6257 - 6258 - if (ret < 0) 6259 - goto err; 6260 - if (attached) 6261 - return 0; 6262 - 6263 - if (p->flags & IORING_SETUP_SQ_AFF) { 6264 - int cpu = p->sq_thread_cpu; 6265 - 6266 - ret = -EINVAL; 6267 - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) 6268 - goto err_sqpoll; 6269 - sqd->sq_cpu = cpu; 6270 - } else { 6271 - sqd->sq_cpu = -1; 6272 - } 6273 - 6274 - sqd->task_pid = current->pid; 6275 - sqd->task_tgid = current->tgid; 6276 - tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); 6277 - if (IS_ERR(tsk)) { 6278 - ret = PTR_ERR(tsk); 6279 - goto err_sqpoll; 6280 - } 6281 - 6282 - sqd->thread = tsk; 6283 - ret = io_uring_alloc_task_context(tsk, ctx); 6284 - wake_up_new_task(tsk); 6285 - if (ret) 6286 - goto err; 6287 - } else if (p->flags & IORING_SETUP_SQ_AFF) { 6288 - /* Can't have SQ_AFF without SQPOLL */ 6289 - ret = -EINVAL; 6290 - goto err; 6291 - } 6292 - 6293 - return 0; 6294 - err_sqpoll: 6295 - complete(&ctx->sq_data->exited); 6296 - err: 6297 - io_sq_thread_finish(ctx); 6298 - return ret; 6299 6555 } 6300 6556 6301 6557 static inline void __io_unaccount_mem(struct user_struct *user, ··· 7319 7755 * Find any io_uring ctx that this task has registered or done IO on, and cancel 7320 7756 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. 7321 7757 */ 7322 - static __cold void io_uring_cancel_generic(bool cancel_all, 7323 - struct io_sq_data *sqd) 7758 + __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) 7324 7759 { 7325 7760 struct io_uring_task *tctx = current->io_uring; 7326 7761 struct io_ring_ctx *ctx; ··· 7596 8033 } 7597 8034 7598 8035 #endif /* !CONFIG_MMU */ 7599 - 7600 - static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) 7601 - { 7602 - DEFINE_WAIT(wait); 7603 - 7604 - do { 7605 - if (!io_sqring_full(ctx)) 7606 - break; 7607 - prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); 7608 - 7609 - if (!io_sqring_full(ctx)) 7610 - break; 7611 - schedule(); 7612 - } while (!signal_pending(current)); 7613 - 7614 - finish_wait(&ctx->sqo_sq_wait, &wait); 7615 - return 0; 7616 - } 7617 8036 7618 8037 static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) 7619 8038 {
+34
io_uring/io_uring.h
··· 64 64 smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); 65 65 } 66 66 67 + static inline bool io_sqring_full(struct io_ring_ctx *ctx) 68 + { 69 + struct io_rings *r = ctx->rings; 70 + 71 + return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; 72 + } 73 + 74 + static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) 75 + { 76 + struct io_rings *rings = ctx->rings; 77 + 78 + /* make sure SQ entry isn't read before tail */ 79 + return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; 80 + } 81 + 82 + static inline bool io_run_task_work(void) 83 + { 84 + if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { 85 + __set_current_state(TASK_RUNNING); 86 + clear_notify_signal(); 87 + if (task_work_pending(current)) 88 + task_work_run(); 89 + return true; 90 + } 91 + 92 + return false; 93 + } 94 + 67 95 void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); 68 96 void io_req_complete_post(struct io_kiocb *req); 69 97 void __io_req_complete_post(struct io_kiocb *req); ··· 129 101 void io_req_task_complete(struct io_kiocb *req, bool *locked); 130 102 void io_req_task_queue_fail(struct io_kiocb *req, int ret); 131 103 int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); 104 + __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 105 + int io_uring_alloc_task_context(struct task_struct *task, 106 + struct io_ring_ctx *ctx); 107 + 108 + int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); 109 + int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); 132 110 133 111 void io_free_req(struct io_kiocb *req); 134 112 void io_queue_next(struct io_kiocb *req);
+426
io_uring/sqpoll.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Contains the core associated with submission side polling of the SQ 4 + * ring, offloading submissions from the application to a kernel thread. 5 + */ 6 + #include <linux/kernel.h> 7 + #include <linux/errno.h> 8 + #include <linux/file.h> 9 + #include <linux/mm.h> 10 + #include <linux/slab.h> 11 + #include <linux/audit.h> 12 + #include <linux/security.h> 13 + #include <linux/io_uring.h> 14 + 15 + #include <uapi/linux/io_uring.h> 16 + 17 + #include "io_uring_types.h" 18 + #include "io_uring.h" 19 + #include "sqpoll.h" 20 + 21 + #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 22 + 23 + enum { 24 + IO_SQ_THREAD_SHOULD_STOP = 0, 25 + IO_SQ_THREAD_SHOULD_PARK, 26 + }; 27 + 28 + void io_sq_thread_unpark(struct io_sq_data *sqd) 29 + __releases(&sqd->lock) 30 + { 31 + WARN_ON_ONCE(sqd->thread == current); 32 + 33 + /* 34 + * Do the dance but not conditional clear_bit() because it'd race with 35 + * other threads incrementing park_pending and setting the bit. 36 + */ 37 + clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 38 + if (atomic_dec_return(&sqd->park_pending)) 39 + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 40 + mutex_unlock(&sqd->lock); 41 + } 42 + 43 + void io_sq_thread_park(struct io_sq_data *sqd) 44 + __acquires(&sqd->lock) 45 + { 46 + WARN_ON_ONCE(sqd->thread == current); 47 + 48 + atomic_inc(&sqd->park_pending); 49 + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 50 + mutex_lock(&sqd->lock); 51 + if (sqd->thread) 52 + wake_up_process(sqd->thread); 53 + } 54 + 55 + void io_sq_thread_stop(struct io_sq_data *sqd) 56 + { 57 + WARN_ON_ONCE(sqd->thread == current); 58 + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); 59 + 60 + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 61 + mutex_lock(&sqd->lock); 62 + if (sqd->thread) 63 + wake_up_process(sqd->thread); 64 + mutex_unlock(&sqd->lock); 65 + wait_for_completion(&sqd->exited); 66 + } 67 + 68 + void io_put_sq_data(struct io_sq_data *sqd) 69 + { 70 + if (refcount_dec_and_test(&sqd->refs)) { 71 + WARN_ON_ONCE(atomic_read(&sqd->park_pending)); 72 + 73 + io_sq_thread_stop(sqd); 74 + kfree(sqd); 75 + } 76 + } 77 + 78 + static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) 79 + { 80 + struct io_ring_ctx *ctx; 81 + unsigned sq_thread_idle = 0; 82 + 83 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 84 + sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); 85 + sqd->sq_thread_idle = sq_thread_idle; 86 + } 87 + 88 + void io_sq_thread_finish(struct io_ring_ctx *ctx) 89 + { 90 + struct io_sq_data *sqd = ctx->sq_data; 91 + 92 + if (sqd) { 93 + io_sq_thread_park(sqd); 94 + list_del_init(&ctx->sqd_list); 95 + io_sqd_update_thread_idle(sqd); 96 + io_sq_thread_unpark(sqd); 97 + 98 + io_put_sq_data(sqd); 99 + ctx->sq_data = NULL; 100 + } 101 + } 102 + 103 + static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) 104 + { 105 + struct io_ring_ctx *ctx_attach; 106 + struct io_sq_data *sqd; 107 + struct fd f; 108 + 109 + f = fdget(p->wq_fd); 110 + if (!f.file) 111 + return ERR_PTR(-ENXIO); 112 + if (!io_is_uring_fops(f.file)) { 113 + fdput(f); 114 + return ERR_PTR(-EINVAL); 115 + } 116 + 117 + ctx_attach = f.file->private_data; 118 + sqd = ctx_attach->sq_data; 119 + if (!sqd) { 120 + fdput(f); 121 + return ERR_PTR(-EINVAL); 122 + } 123 + if (sqd->task_tgid != current->tgid) { 124 + fdput(f); 125 + return ERR_PTR(-EPERM); 126 + } 127 + 128 + refcount_inc(&sqd->refs); 129 + fdput(f); 130 + return sqd; 131 + } 132 + 133 + static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, 134 + bool *attached) 135 + { 136 + struct io_sq_data *sqd; 137 + 138 + *attached = false; 139 + if (p->flags & IORING_SETUP_ATTACH_WQ) { 140 + sqd = io_attach_sq_data(p); 141 + if (!IS_ERR(sqd)) { 142 + *attached = true; 143 + return sqd; 144 + } 145 + /* fall through for EPERM case, setup new sqd/task */ 146 + if (PTR_ERR(sqd) != -EPERM) 147 + return sqd; 148 + } 149 + 150 + sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); 151 + if (!sqd) 152 + return ERR_PTR(-ENOMEM); 153 + 154 + atomic_set(&sqd->park_pending, 0); 155 + refcount_set(&sqd->refs, 1); 156 + INIT_LIST_HEAD(&sqd->ctx_list); 157 + mutex_init(&sqd->lock); 158 + init_waitqueue_head(&sqd->wait); 159 + init_completion(&sqd->exited); 160 + return sqd; 161 + } 162 + 163 + static inline bool io_sqd_events_pending(struct io_sq_data *sqd) 164 + { 165 + return READ_ONCE(sqd->state); 166 + } 167 + 168 + static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) 169 + { 170 + unsigned int to_submit; 171 + int ret = 0; 172 + 173 + to_submit = io_sqring_entries(ctx); 174 + /* if we're handling multiple rings, cap submit size for fairness */ 175 + if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) 176 + to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; 177 + 178 + if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { 179 + const struct cred *creds = NULL; 180 + 181 + if (ctx->sq_creds != current_cred()) 182 + creds = override_creds(ctx->sq_creds); 183 + 184 + mutex_lock(&ctx->uring_lock); 185 + if (!wq_list_empty(&ctx->iopoll_list)) 186 + io_do_iopoll(ctx, true); 187 + 188 + /* 189 + * Don't submit if refs are dying, good for io_uring_register(), 190 + * but also it is relied upon by io_ring_exit_work() 191 + */ 192 + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && 193 + !(ctx->flags & IORING_SETUP_R_DISABLED)) 194 + ret = io_submit_sqes(ctx, to_submit); 195 + mutex_unlock(&ctx->uring_lock); 196 + 197 + if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) 198 + wake_up(&ctx->sqo_sq_wait); 199 + if (creds) 200 + revert_creds(creds); 201 + } 202 + 203 + return ret; 204 + } 205 + 206 + static bool io_sqd_handle_event(struct io_sq_data *sqd) 207 + { 208 + bool did_sig = false; 209 + struct ksignal ksig; 210 + 211 + if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || 212 + signal_pending(current)) { 213 + mutex_unlock(&sqd->lock); 214 + if (signal_pending(current)) 215 + did_sig = get_signal(&ksig); 216 + cond_resched(); 217 + mutex_lock(&sqd->lock); 218 + } 219 + return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 220 + } 221 + 222 + static int io_sq_thread(void *data) 223 + { 224 + struct io_sq_data *sqd = data; 225 + struct io_ring_ctx *ctx; 226 + unsigned long timeout = 0; 227 + char buf[TASK_COMM_LEN]; 228 + DEFINE_WAIT(wait); 229 + 230 + snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); 231 + set_task_comm(current, buf); 232 + 233 + if (sqd->sq_cpu != -1) 234 + set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); 235 + else 236 + set_cpus_allowed_ptr(current, cpu_online_mask); 237 + current->flags |= PF_NO_SETAFFINITY; 238 + 239 + audit_alloc_kernel(current); 240 + 241 + mutex_lock(&sqd->lock); 242 + while (1) { 243 + bool cap_entries, sqt_spin = false; 244 + 245 + if (io_sqd_events_pending(sqd) || signal_pending(current)) { 246 + if (io_sqd_handle_event(sqd)) 247 + break; 248 + timeout = jiffies + sqd->sq_thread_idle; 249 + } 250 + 251 + cap_entries = !list_is_singular(&sqd->ctx_list); 252 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 253 + int ret = __io_sq_thread(ctx, cap_entries); 254 + 255 + if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) 256 + sqt_spin = true; 257 + } 258 + if (io_run_task_work()) 259 + sqt_spin = true; 260 + 261 + if (sqt_spin || !time_after(jiffies, timeout)) { 262 + cond_resched(); 263 + if (sqt_spin) 264 + timeout = jiffies + sqd->sq_thread_idle; 265 + continue; 266 + } 267 + 268 + prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); 269 + if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { 270 + bool needs_sched = true; 271 + 272 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 273 + atomic_or(IORING_SQ_NEED_WAKEUP, 274 + &ctx->rings->sq_flags); 275 + if ((ctx->flags & IORING_SETUP_IOPOLL) && 276 + !wq_list_empty(&ctx->iopoll_list)) { 277 + needs_sched = false; 278 + break; 279 + } 280 + 281 + /* 282 + * Ensure the store of the wakeup flag is not 283 + * reordered with the load of the SQ tail 284 + */ 285 + smp_mb__after_atomic(); 286 + 287 + if (io_sqring_entries(ctx)) { 288 + needs_sched = false; 289 + break; 290 + } 291 + } 292 + 293 + if (needs_sched) { 294 + mutex_unlock(&sqd->lock); 295 + schedule(); 296 + mutex_lock(&sqd->lock); 297 + } 298 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 299 + atomic_andnot(IORING_SQ_NEED_WAKEUP, 300 + &ctx->rings->sq_flags); 301 + } 302 + 303 + finish_wait(&sqd->wait, &wait); 304 + timeout = jiffies + sqd->sq_thread_idle; 305 + } 306 + 307 + io_uring_cancel_generic(true, sqd); 308 + sqd->thread = NULL; 309 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 310 + atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); 311 + io_run_task_work(); 312 + mutex_unlock(&sqd->lock); 313 + 314 + audit_free(current); 315 + 316 + complete(&sqd->exited); 317 + do_exit(0); 318 + } 319 + 320 + int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) 321 + { 322 + DEFINE_WAIT(wait); 323 + 324 + do { 325 + if (!io_sqring_full(ctx)) 326 + break; 327 + prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); 328 + 329 + if (!io_sqring_full(ctx)) 330 + break; 331 + schedule(); 332 + } while (!signal_pending(current)); 333 + 334 + finish_wait(&ctx->sqo_sq_wait, &wait); 335 + return 0; 336 + } 337 + 338 + __cold int io_sq_offload_create(struct io_ring_ctx *ctx, 339 + struct io_uring_params *p) 340 + { 341 + int ret; 342 + 343 + /* Retain compatibility with failing for an invalid attach attempt */ 344 + if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == 345 + IORING_SETUP_ATTACH_WQ) { 346 + struct fd f; 347 + 348 + f = fdget(p->wq_fd); 349 + if (!f.file) 350 + return -ENXIO; 351 + if (!io_is_uring_fops(f.file)) { 352 + fdput(f); 353 + return -EINVAL; 354 + } 355 + fdput(f); 356 + } 357 + if (ctx->flags & IORING_SETUP_SQPOLL) { 358 + struct task_struct *tsk; 359 + struct io_sq_data *sqd; 360 + bool attached; 361 + 362 + ret = security_uring_sqpoll(); 363 + if (ret) 364 + return ret; 365 + 366 + sqd = io_get_sq_data(p, &attached); 367 + if (IS_ERR(sqd)) { 368 + ret = PTR_ERR(sqd); 369 + goto err; 370 + } 371 + 372 + ctx->sq_creds = get_current_cred(); 373 + ctx->sq_data = sqd; 374 + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); 375 + if (!ctx->sq_thread_idle) 376 + ctx->sq_thread_idle = HZ; 377 + 378 + io_sq_thread_park(sqd); 379 + list_add(&ctx->sqd_list, &sqd->ctx_list); 380 + io_sqd_update_thread_idle(sqd); 381 + /* don't attach to a dying SQPOLL thread, would be racy */ 382 + ret = (attached && !sqd->thread) ? -ENXIO : 0; 383 + io_sq_thread_unpark(sqd); 384 + 385 + if (ret < 0) 386 + goto err; 387 + if (attached) 388 + return 0; 389 + 390 + if (p->flags & IORING_SETUP_SQ_AFF) { 391 + int cpu = p->sq_thread_cpu; 392 + 393 + ret = -EINVAL; 394 + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) 395 + goto err_sqpoll; 396 + sqd->sq_cpu = cpu; 397 + } else { 398 + sqd->sq_cpu = -1; 399 + } 400 + 401 + sqd->task_pid = current->pid; 402 + sqd->task_tgid = current->tgid; 403 + tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); 404 + if (IS_ERR(tsk)) { 405 + ret = PTR_ERR(tsk); 406 + goto err_sqpoll; 407 + } 408 + 409 + sqd->thread = tsk; 410 + ret = io_uring_alloc_task_context(tsk, ctx); 411 + wake_up_new_task(tsk); 412 + if (ret) 413 + goto err; 414 + } else if (p->flags & IORING_SETUP_SQ_AFF) { 415 + /* Can't have SQ_AFF without SQPOLL */ 416 + ret = -EINVAL; 417 + goto err; 418 + } 419 + 420 + return 0; 421 + err_sqpoll: 422 + complete(&ctx->sq_data->exited); 423 + err: 424 + io_sq_thread_finish(ctx); 425 + return ret; 426 + }
+29
io_uring/sqpoll.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + struct io_sq_data { 4 + refcount_t refs; 5 + atomic_t park_pending; 6 + struct mutex lock; 7 + 8 + /* ctx's that are using this sqd */ 9 + struct list_head ctx_list; 10 + 11 + struct task_struct *thread; 12 + struct wait_queue_head wait; 13 + 14 + unsigned sq_thread_idle; 15 + int sq_cpu; 16 + pid_t task_pid; 17 + pid_t task_tgid; 18 + 19 + unsigned long state; 20 + struct completion exited; 21 + }; 22 + 23 + int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p); 24 + void io_sq_thread_finish(struct io_ring_ctx *ctx); 25 + void io_sq_thread_stop(struct io_sq_data *sqd); 26 + void io_sq_thread_park(struct io_sq_data *sqd); 27 + void io_sq_thread_unpark(struct io_sq_data *sqd); 28 + void io_put_sq_data(struct io_sq_data *sqd); 29 + int io_sqpoll_wait_sq(struct io_ring_ctx *ctx);