Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: add IORING_OP_WAITID support

This adds support for a fully async version of waitid(2). If an event
isn't immediately available, the request waits for a callback to
trigger a retry.

The format of the sqe is as follows:

sqe->len The 'which', the idtype being queried/waited for.
sqe->fd The 'pid' (or id) being waited for.
sqe->file_index The 'options' being set.
sqe->addr2 A pointer to siginfo_t, if any, being filled in.

buf_index, addr3, and waitid_flags are reserved/unused for now.
waitid_flags will be used for options for this request type. One
interesting use case may be to add multi-shot support, so that the
request stays armed and posts a notification every time a monitored
process state change occurs.

Note that this does not support rusage, on Arnd's recommendation.

See the waitid(2) man page for details on the arguments.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+410 -1
+2
include/linux/io_uring_types.h
··· 313 313 struct list_head cq_overflow_list; 314 314 struct io_hash_table cancel_table; 315 315 316 + struct hlist_head waitid_list; 317 + 316 318 const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 317 319 struct io_sq_data *sq_data; /* if using sq thread polling */ 318 320
+2
include/uapi/linux/io_uring.h
··· 65 65 __u32 xattr_flags; 66 66 __u32 msg_ring_flags; 67 67 __u32 uring_cmd_flags; 68 + __u32 waitid_flags; 68 69 }; 69 70 __u64 user_data; /* data to be passed back at completion time */ 70 71 /* pack this to avoid bogus arm OABI complaints */ ··· 242 241 IORING_OP_SEND_ZC, 243 242 IORING_OP_SENDMSG_ZC, 244 243 IORING_OP_READ_MULTISHOT, 244 + IORING_OP_WAITID, 245 245 246 246 /* this goes last, obviously */ 247 247 IORING_OP_LAST,
+2 -1
io_uring/Makefile
··· 7 7 openclose.o uring_cmd.o epoll.o \ 8 8 statx.o net.o msg_ring.o timeout.o \ 9 9 sqpoll.o fdinfo.o tctx.o poll.o \ 10 - cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o 10 + cancel.o kbuf.o rsrc.o rw.o opdef.o \ 11 + notif.o waitid.o 11 12 obj-$(CONFIG_IO_WQ) += io-wq.o
+5
io_uring/cancel.c
··· 15 15 #include "tctx.h" 16 16 #include "poll.h" 17 17 #include "timeout.h" 18 + #include "waitid.h" 18 19 #include "cancel.h" 19 20 20 21 struct io_cancel { ··· 117 116 return 0; 118 117 119 118 ret = io_poll_cancel(ctx, cd, issue_flags); 119 + if (ret != -ENOENT) 120 + return ret; 121 + 122 + ret = io_waitid_cancel(ctx, cd, issue_flags); 120 123 if (ret != -ENOENT) 121 124 return ret; 122 125
+3
io_uring/io_uring.c
··· 92 92 #include "cancel.h" 93 93 #include "net.h" 94 94 #include "notif.h" 95 + #include "waitid.h" 95 96 96 97 #include "timeout.h" 97 98 #include "poll.h" ··· 349 348 INIT_LIST_HEAD(&ctx->tctx_list); 350 349 ctx->submit_state.free_list.next = NULL; 351 350 INIT_WQ_LIST(&ctx->locked_free_list); 351 + INIT_HLIST_HEAD(&ctx->waitid_list); 352 352 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); 353 353 INIT_WQ_LIST(&ctx->submit_state.compl_reqs); 354 354 return ctx; ··· 3305 3303 ret |= io_cancel_defer_files(ctx, task, cancel_all); 3306 3304 mutex_lock(&ctx->uring_lock); 3307 3305 ret |= io_poll_remove_all(ctx, task, cancel_all); 3306 + ret |= io_waitid_remove_all(ctx, task, cancel_all); 3308 3307 mutex_unlock(&ctx->uring_lock); 3309 3308 ret |= io_kill_timeouts(ctx, task, cancel_all); 3310 3309 if (task)
+9
io_uring/opdef.c
··· 33 33 #include "poll.h" 34 34 #include "cancel.h" 35 35 #include "rw.h" 36 + #include "waitid.h" 36 37 37 38 static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) 38 39 { ··· 440 439 .prep = io_read_mshot_prep, 441 440 .issue = io_read_mshot, 442 441 }, 442 + [IORING_OP_WAITID] = { 443 + .prep = io_waitid_prep, 444 + .issue = io_waitid, 445 + }, 443 446 }; 444 447 445 448 const struct io_cold_def io_cold_defs[] = { ··· 665 660 }, 666 661 [IORING_OP_READ_MULTISHOT] = { 667 662 .name = "READ_MULTISHOT", 663 + }, 664 + [IORING_OP_WAITID] = { 665 + .name = "WAITID", 666 + .async_size = sizeof(struct io_waitid_async), 668 667 }, 669 668 }; 670 669
+372
io_uring/waitid.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Support for async notification of waitid 4 + */ 5 + #include <linux/kernel.h> 6 + #include <linux/errno.h> 7 + #include <linux/fs.h> 8 + #include <linux/file.h> 9 + #include <linux/compat.h> 10 + #include <linux/io_uring.h> 11 + 12 + #include <uapi/linux/io_uring.h> 13 + 14 + #include "io_uring.h" 15 + #include "cancel.h" 16 + #include "waitid.h" 17 + #include "../kernel/exit.h" 18 + 19 + static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); 20 + 21 + #define IO_WAITID_CANCEL_FLAG BIT(31) 22 + #define IO_WAITID_REF_MASK GENMASK(30, 0) 23 + 24 + struct io_waitid { 25 + struct file *file; 26 + int which; 27 + pid_t upid; 28 + int options; 29 + atomic_t refs; 30 + struct wait_queue_head *head; 31 + struct siginfo __user *infop; 32 + struct waitid_info info; 33 + }; 34 + 35 + static void io_waitid_free(struct io_kiocb *req) 36 + { 37 + struct io_waitid_async *iwa = req->async_data; 38 + 39 + put_pid(iwa->wo.wo_pid); 40 + kfree(req->async_data); 41 + req->async_data = NULL; 42 + req->flags &= ~REQ_F_ASYNC_DATA; 43 + } 44 + 45 + #ifdef CONFIG_COMPAT 46 + static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) 47 + { 48 + struct compat_siginfo __user *infop; 49 + bool ret; 50 + 51 + infop = (struct compat_siginfo __user *) iw->infop; 52 + 53 + if (!user_write_access_begin(infop, sizeof(*infop))) 54 + return false; 55 + 56 + unsafe_put_user(signo, &infop->si_signo, Efault); 57 + unsafe_put_user(0, &infop->si_errno, Efault); 58 + unsafe_put_user(iw->info.cause, &infop->si_code, Efault); 59 + unsafe_put_user(iw->info.pid, &infop->si_pid, Efault); 60 + unsafe_put_user(iw->info.uid, &infop->si_uid, Efault); 61 + unsafe_put_user(iw->info.status, &infop->si_status, Efault); 62 + ret = true; 63 + done: 64 + user_write_access_end(); 65 + return ret; 66 + Efault: 67 + ret = false; 68 + goto done; 69 + } 70 + #endif 71 + 72 + static bool io_waitid_copy_si(struct io_kiocb *req, int signo) 73 + { 
74 + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 75 + bool ret; 76 + 77 + if (!iw->infop) 78 + return true; 79 + 80 + #ifdef CONFIG_COMPAT 81 + if (req->ctx->compat) 82 + return io_waitid_compat_copy_si(iw, signo); 83 + #endif 84 + 85 + if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) 86 + return false; 87 + 88 + unsafe_put_user(signo, &iw->infop->si_signo, Efault); 89 + unsafe_put_user(0, &iw->infop->si_errno, Efault); 90 + unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault); 91 + unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault); 92 + unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault); 93 + unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault); 94 + ret = true; 95 + done: 96 + user_write_access_end(); 97 + return ret; 98 + Efault: 99 + ret = false; 100 + goto done; 101 + } 102 + 103 + static int io_waitid_finish(struct io_kiocb *req, int ret) 104 + { 105 + int signo = 0; 106 + 107 + if (ret > 0) { 108 + signo = SIGCHLD; 109 + ret = 0; 110 + } 111 + 112 + if (!io_waitid_copy_si(req, signo)) 113 + ret = -EFAULT; 114 + io_waitid_free(req); 115 + return ret; 116 + } 117 + 118 + static void io_waitid_complete(struct io_kiocb *req, int ret) 119 + { 120 + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 121 + struct io_tw_state ts = { .locked = true }; 122 + 123 + /* anyone completing better be holding a reference */ 124 + WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); 125 + 126 + lockdep_assert_held(&req->ctx->uring_lock); 127 + 128 + /* 129 + * Did cancel find it meanwhile? 
130 + */ 131 + if (hlist_unhashed(&req->hash_node)) 132 + return; 133 + 134 + hlist_del_init(&req->hash_node); 135 + 136 + ret = io_waitid_finish(req, ret); 137 + if (ret < 0) 138 + req_set_fail(req); 139 + io_req_set_res(req, ret, 0); 140 + io_req_task_complete(req, &ts); 141 + } 142 + 143 + static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) 144 + { 145 + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 146 + struct io_waitid_async *iwa = req->async_data; 147 + 148 + /* 149 + * Mark us canceled regardless of ownership. This will prevent a 150 + * potential retry from a spurious wakeup. 151 + */ 152 + atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs); 153 + 154 + /* claim ownership */ 155 + if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) 156 + return false; 157 + 158 + spin_lock_irq(&iw->head->lock); 159 + list_del_init(&iwa->wo.child_wait.entry); 160 + spin_unlock_irq(&iw->head->lock); 161 + io_waitid_complete(req, -ECANCELED); 162 + return true; 163 + } 164 + 165 + int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 166 + unsigned int issue_flags) 167 + { 168 + struct hlist_node *tmp; 169 + struct io_kiocb *req; 170 + int nr = 0; 171 + 172 + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) 173 + return -ENOENT; 174 + 175 + io_ring_submit_lock(ctx, issue_flags); 176 + hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { 177 + if (req->cqe.user_data != cd->data && 178 + !(cd->flags & IORING_ASYNC_CANCEL_ANY)) 179 + continue; 180 + if (__io_waitid_cancel(ctx, req)) 181 + nr++; 182 + if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) 183 + break; 184 + } 185 + io_ring_submit_unlock(ctx, issue_flags); 186 + 187 + if (nr) 188 + return nr; 189 + 190 + return -ENOENT; 191 + } 192 + 193 + bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, 194 + bool cancel_all) 195 + { 196 + struct hlist_node *tmp; 197 + struct io_kiocb *req; 198 + bool found = false; 199 
+ 200 + lockdep_assert_held(&ctx->uring_lock); 201 + 202 + hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { 203 + if (!io_match_task_safe(req, task, cancel_all)) 204 + continue; 205 + __io_waitid_cancel(ctx, req); 206 + found = true; 207 + } 208 + 209 + return found; 210 + } 211 + 212 + static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) 213 + { 214 + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 215 + struct io_waitid_async *iwa = req->async_data; 216 + 217 + if (!atomic_sub_return(1, &iw->refs)) 218 + return false; 219 + 220 + /* 221 + * Wakeup triggered, racing with us. It was prevented from 222 + * completing because of that, queue up the tw to do that. 223 + */ 224 + req->io_task_work.func = io_waitid_cb; 225 + io_req_task_work_add(req); 226 + remove_wait_queue(iw->head, &iwa->wo.child_wait); 227 + return true; 228 + } 229 + 230 + static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) 231 + { 232 + struct io_waitid_async *iwa = req->async_data; 233 + struct io_ring_ctx *ctx = req->ctx; 234 + int ret; 235 + 236 + io_tw_lock(ctx, ts); 237 + 238 + ret = __do_wait(&iwa->wo); 239 + 240 + /* 241 + * If we get -ERESTARTSYS here, we need to re-arm and check again 242 + * to ensure we get another callback. If the retry works, then we can 243 + * just remove ourselves from the waitqueue again and finish the 244 + * request. 
245 + */ 246 + if (unlikely(ret == -ERESTARTSYS)) { 247 + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 248 + 249 + /* Don't retry if cancel found it meanwhile */ 250 + ret = -ECANCELED; 251 + if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) { 252 + iw->head = &current->signal->wait_chldexit; 253 + add_wait_queue(iw->head, &iwa->wo.child_wait); 254 + ret = __do_wait(&iwa->wo); 255 + if (ret == -ERESTARTSYS) { 256 + /* retry armed, drop our ref */ 257 + io_waitid_drop_issue_ref(req); 258 + return; 259 + } 260 + 261 + remove_wait_queue(iw->head, &iwa->wo.child_wait); 262 + } 263 + } 264 + 265 + io_waitid_complete(req, ret); 266 + } 267 + 268 + static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, 269 + int sync, void *key) 270 + { 271 + struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); 272 + struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo); 273 + struct io_kiocb *req = iwa->req; 274 + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 275 + struct task_struct *p = key; 276 + 277 + if (!pid_child_should_wake(wo, p)) 278 + return 0; 279 + 280 + /* cancel is in progress */ 281 + if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) 282 + return 1; 283 + 284 + req->io_task_work.func = io_waitid_cb; 285 + io_req_task_work_add(req); 286 + list_del_init(&wait->entry); 287 + return 1; 288 + } 289 + 290 + int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 291 + { 292 + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 293 + 294 + if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) 295 + return -EINVAL; 296 + 297 + iw->which = READ_ONCE(sqe->len); 298 + iw->upid = READ_ONCE(sqe->fd); 299 + iw->options = READ_ONCE(sqe->file_index); 300 + iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 301 + return 0; 302 + } 303 + 304 + int io_waitid(struct io_kiocb *req, unsigned int issue_flags) 305 + { 306 + struct io_waitid *iw = 
io_kiocb_to_cmd(req, struct io_waitid); 307 + struct io_ring_ctx *ctx = req->ctx; 308 + struct io_waitid_async *iwa; 309 + int ret; 310 + 311 + if (io_alloc_async_data(req)) 312 + return -ENOMEM; 313 + 314 + iwa = req->async_data; 315 + iwa->req = req; 316 + 317 + ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, 318 + iw->options, NULL); 319 + if (ret) 320 + goto done; 321 + 322 + /* 323 + * Mark the request as busy upfront, in case we're racing with the 324 + * wakeup. If we are, then we'll notice when we drop this initial 325 + * reference again after arming. 326 + */ 327 + atomic_set(&iw->refs, 1); 328 + 329 + /* 330 + * Cancel must hold the ctx lock, so there's no risk of cancelation 331 + * finding us until a) we remain on the list, and b) the lock is 332 + * dropped. We only need to worry about racing with the wakeup 333 + * callback. 334 + */ 335 + io_ring_submit_lock(ctx, issue_flags); 336 + hlist_add_head(&req->hash_node, &ctx->waitid_list); 337 + 338 + init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); 339 + iwa->wo.child_wait.private = req->task; 340 + iw->head = &current->signal->wait_chldexit; 341 + add_wait_queue(iw->head, &iwa->wo.child_wait); 342 + 343 + ret = __do_wait(&iwa->wo); 344 + if (ret == -ERESTARTSYS) { 345 + /* 346 + * Nobody else grabbed a reference, it'll complete when we get 347 + * a waitqueue callback, or if someone cancels it. 348 + */ 349 + if (!io_waitid_drop_issue_ref(req)) { 350 + io_ring_submit_unlock(ctx, issue_flags); 351 + return IOU_ISSUE_SKIP_COMPLETE; 352 + } 353 + 354 + /* 355 + * Wakeup triggered, racing with us. It was prevented from 356 + * completing because of that, queue up the tw to do that. 
357 + */ 358 + io_ring_submit_unlock(ctx, issue_flags); 359 + return IOU_ISSUE_SKIP_COMPLETE; 360 + } 361 + 362 + hlist_del_init(&req->hash_node); 363 + remove_wait_queue(iw->head, &iwa->wo.child_wait); 364 + ret = io_waitid_finish(req, ret); 365 + 366 + io_ring_submit_unlock(ctx, issue_flags); 367 + done: 368 + if (ret < 0) 369 + req_set_fail(req); 370 + io_req_set_res(req, ret, 0); 371 + return IOU_OK; 372 + }
+15
io_uring/waitid.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "../kernel/exit.h" 4 + 5 + struct io_waitid_async { 6 + struct io_kiocb *req; 7 + struct wait_opts wo; 8 + }; 9 + 10 + int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 11 + int io_waitid(struct io_kiocb *req, unsigned int issue_flags); 12 + int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 13 + unsigned int issue_flags); 14 + bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, 15 + bool cancel_all);