Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: add io_uring_types.h

This adds definitions of structs that both the core and the various
opcode handlers need to know about.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+499 -488
+3 -488
io_uring/io_uring.c
··· 90 90 #include "../fs/internal.h" 91 91 #include "io-wq.h" 92 92 93 + #include "io_uring_types.h" 94 + 93 95 #define IORING_MAX_ENTRIES 32768 94 96 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) 95 97 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 ··· 123 121 #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) 124 122 125 123 #define IO_TCTX_REFS_CACHE_NR (1U << 10) 126 - 127 - struct io_uring { 128 - u32 head ____cacheline_aligned_in_smp; 129 - u32 tail ____cacheline_aligned_in_smp; 130 - }; 131 - 132 - /* 133 - * This data is shared with the application through the mmap at offsets 134 - * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. 135 - * 136 - * The offsets to the member fields are published through struct 137 - * io_sqring_offsets when calling io_uring_setup. 138 - */ 139 - struct io_rings { 140 - /* 141 - * Head and tail offsets into the ring; the offsets need to be 142 - * masked to get valid indices. 143 - * 144 - * The kernel controls head of the sq ring and the tail of the cq ring, 145 - * and the application controls tail of the sq ring and the head of the 146 - * cq ring. 147 - */ 148 - struct io_uring sq, cq; 149 - /* 150 - * Bitmasks to apply to head and tail offsets (constant, equals 151 - * ring_entries - 1) 152 - */ 153 - u32 sq_ring_mask, cq_ring_mask; 154 - /* Ring sizes (constant, power of 2) */ 155 - u32 sq_ring_entries, cq_ring_entries; 156 - /* 157 - * Number of invalid entries dropped by the kernel due to 158 - * invalid index stored in array 159 - * 160 - * Written by the kernel, shouldn't be modified by the 161 - * application (i.e. get number of "new events" by comparing to 162 - * cached value). 163 - * 164 - * After a new SQ head value was read by the application this 165 - * counter includes all submissions that were dropped reaching 166 - * the new SQ head (and possibly more). 167 - */ 168 - u32 sq_dropped; 169 - /* 170 - * Runtime SQ flags 171 - * 172 - * Written by the kernel, shouldn't be modified by the 173 - * application. 174 - * 175 - * The application needs a full memory barrier before checking 176 - * for IORING_SQ_NEED_WAKEUP after updating the sq tail. 177 - */ 178 - atomic_t sq_flags; 179 - /* 180 - * Runtime CQ flags 181 - * 182 - * Written by the application, shouldn't be modified by the 183 - * kernel. 184 - */ 185 - u32 cq_flags; 186 - /* 187 - * Number of completion events lost because the queue was full; 188 - * this should be avoided by the application by making sure 189 - * there are not more requests pending than there is space in 190 - * the completion queue. 191 - * 192 - * Written by the kernel, shouldn't be modified by the 193 - * application (i.e. get number of "new events" by comparing to 194 - * cached value). 195 - * 196 - * As completion events come in out of order this counter is not 197 - * ordered with any other data. 198 - */ 199 - u32 cq_overflow; 200 - /* 201 - * Ring buffer of completion events. 202 - * 203 - * The kernel writes completion events fresh every time they are 204 - * produced, so the application is allowed to modify pending 205 - * entries. 206 - */ 207 - struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; 208 - }; 209 124 210 125 struct io_mapped_ubuf { 211 126 u64 ubuf; ··· 169 250 struct file *file; 170 251 struct io_mapped_ubuf *buf; 171 252 }; 172 - }; 173 - 174 - struct io_file_table { 175 - struct io_fixed_file *files; 176 - unsigned long *bitmap; 177 - unsigned int alloc_hint; 178 253 }; 179 254 180 255 struct io_rsrc_node { ··· 223 310 __u16 bgid; 224 311 }; 225 312 226 - struct io_restriction { 227 - DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); 228 - DECLARE_BITMAP(sqe_op, IORING_OP_LAST); 229 - u8 sqe_flags_allowed; 230 - u8 sqe_flags_required; 231 - bool registered; 232 - }; 233 - 234 313 enum { 235 314 IO_SQ_THREAD_SHOULD_STOP = 0, 236 315 IO_SQ_THREAD_SHOULD_PARK, ··· 252 347 #define IO_REQ_CACHE_SIZE 32 253 348 #define IO_REQ_ALLOC_BATCH 8 254 349 255 - struct io_submit_link { 256 - struct io_kiocb *head; 257 - struct io_kiocb *last; 258 - }; 259 - 260 - struct io_submit_state { 261 - /* inline/task_work completion list, under ->uring_lock */ 262 - struct io_wq_work_node free_list; 263 - /* batch completion logic */ 264 - struct io_wq_work_list compl_reqs; 265 - struct io_submit_link link; 266 - 267 - bool plug_started; 268 - bool need_plug; 269 - bool flush_cqes; 270 - unsigned short submit_nr; 271 - struct blk_plug plug; 272 - }; 273 - 274 - struct io_ev_fd { 275 - struct eventfd_ctx *cq_ev_fd; 276 - unsigned int eventfd_async: 1; 277 - struct rcu_head rcu; 278 - }; 279 - 280 - #define BGID_ARRAY 64 281 - 282 - struct io_ring_ctx { 283 - /* const or read-mostly hot data */ 284 - struct { 285 - struct percpu_ref refs; 286 - 287 - struct io_rings *rings; 288 - unsigned int flags; 289 - enum task_work_notify_mode notify_method; 290 - unsigned int compat: 1; 291 - unsigned int drain_next: 1; 292 - unsigned int restricted: 1; 293 - unsigned int off_timeout_used: 1; 294 - unsigned int drain_active: 1; 295 - unsigned int drain_disabled: 1; 296 - unsigned int has_evfd: 1; 297 - unsigned int syscall_iopoll: 1; 298 - } ____cacheline_aligned_in_smp; 299 - 300 - /* submission data */ 301 - struct { 302 - struct mutex uring_lock; 303 - 304 - /* 305 - * Ring buffer of indices into array of io_uring_sqe, which is 306 - * mmapped by the application using the IORING_OFF_SQES offset. 307 - * 308 - * This indirection could e.g. be used to assign fixed 309 - * io_uring_sqe entries to operations and only submit them to 310 - * the queue when needed. 311 - * 312 - * The kernel modifies neither the indices array nor the entries 313 - * array. 314 - */ 315 - u32 *sq_array; 316 - struct io_uring_sqe *sq_sqes; 317 - unsigned cached_sq_head; 318 - unsigned sq_entries; 319 - struct list_head defer_list; 320 - 321 - /* 322 - * Fixed resources fast path, should be accessed only under 323 - * uring_lock, and updated through io_uring_register(2) 324 - */ 325 - struct io_rsrc_node *rsrc_node; 326 - int rsrc_cached_refs; 327 - atomic_t cancel_seq; 328 - struct io_file_table file_table; 329 - unsigned nr_user_files; 330 - unsigned nr_user_bufs; 331 - struct io_mapped_ubuf **user_bufs; 332 - 333 - struct io_submit_state submit_state; 334 - 335 - struct io_buffer_list *io_bl; 336 - struct xarray io_bl_xa; 337 - struct list_head io_buffers_cache; 338 - 339 - struct list_head timeout_list; 340 - struct list_head ltimeout_list; 341 - struct list_head cq_overflow_list; 342 - struct list_head apoll_cache; 343 - struct xarray personalities; 344 - u32 pers_next; 345 - unsigned sq_thread_idle; 346 - } ____cacheline_aligned_in_smp; 347 - 348 - /* IRQ completion list, under ->completion_lock */ 349 - struct io_wq_work_list locked_free_list; 350 - unsigned int locked_free_nr; 351 - 352 - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 353 - struct io_sq_data *sq_data; /* if using sq thread polling */ 354 - 355 - struct wait_queue_head sqo_sq_wait; 356 - struct list_head sqd_list; 357 - 358 - unsigned long check_cq; 359 - 360 - struct { 361 - /* 362 - * We cache a range of free CQEs we can use, once exhausted it 363 - * should go through a slower range setup, see __io_get_cqe() 364 - */ 365 - struct io_uring_cqe *cqe_cached; 366 - struct io_uring_cqe *cqe_sentinel; 367 - 368 - unsigned cached_cq_tail; 369 - unsigned cq_entries; 370 - struct io_ev_fd __rcu *io_ev_fd; 371 - struct wait_queue_head cq_wait; 372 - unsigned cq_extra; 373 - atomic_t cq_timeouts; 374 - unsigned cq_last_tm_flush; 375 - } ____cacheline_aligned_in_smp; 376 - 377 - struct { 378 - spinlock_t completion_lock; 379 - 380 - spinlock_t timeout_lock; 381 - 382 - /* 383 - * ->iopoll_list is protected by the ctx->uring_lock for 384 - * io_uring instances that don't use IORING_SETUP_SQPOLL. 385 - * For SQPOLL, only the single threaded io_sq_thread() will 386 - * manipulate the list, hence no extra locking is needed there. 387 - */ 388 - struct io_wq_work_list iopoll_list; 389 - struct hlist_head *cancel_hash; 390 - unsigned cancel_hash_bits; 391 - bool poll_multi_queue; 392 - 393 - struct list_head io_buffers_comp; 394 - } ____cacheline_aligned_in_smp; 395 - 396 - struct io_restriction restrictions; 397 - 398 - /* slow path rsrc auxilary data, used by update/register */ 399 - struct { 400 - struct io_rsrc_node *rsrc_backup_node; 401 - struct io_mapped_ubuf *dummy_ubuf; 402 - struct io_rsrc_data *file_data; 403 - struct io_rsrc_data *buf_data; 404 - 405 - struct delayed_work rsrc_put_work; 406 - struct llist_head rsrc_put_llist; 407 - struct list_head rsrc_ref_list; 408 - spinlock_t rsrc_ref_lock; 409 - 410 - struct list_head io_buffers_pages; 411 - }; 412 - 413 - /* Keep this last, we don't need it for the fast path */ 414 - struct { 415 - #if defined(CONFIG_UNIX) 416 - struct socket *ring_sock; 417 - #endif 418 - /* hashed buffered write serialization */ 419 - struct io_wq_hash *hash_map; 420 - 421 - /* Only used for accounting purposes */ 422 - struct user_struct *user; 423 - struct mm_struct *mm_account; 424 - 425 - /* ctx exit and cancelation */ 426 - struct llist_head fallback_llist; 427 - struct delayed_work fallback_work; 428 - struct work_struct exit_work; 429 - struct list_head tctx_list; 430 - struct completion ref_comp; 431 - u32 iowq_limits[2]; 432 - bool iowq_limits_set; 433 - }; 434 - }; 350 + #define BGID_ARRAY 64 435 351 436 352 /* 437 353 * Arbitrary limit, can be raised if need be ··· 534 808 struct filename *filename; 535 809 }; 536 810 537 - enum { 538 - REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, 539 - REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, 540 - REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, 541 - REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, 542 - REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 543 - REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 544 - REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, 545 - 546 - /* first byte is taken by user flags, shift it to not overlap */ 547 - REQ_F_FAIL_BIT = 8, 548 - REQ_F_INFLIGHT_BIT, 549 - REQ_F_CUR_POS_BIT, 550 - REQ_F_NOWAIT_BIT, 551 - REQ_F_LINK_TIMEOUT_BIT, 552 - REQ_F_NEED_CLEANUP_BIT, 553 - REQ_F_POLLED_BIT, 554 - REQ_F_BUFFER_SELECTED_BIT, 555 - REQ_F_BUFFER_RING_BIT, 556 - REQ_F_COMPLETE_INLINE_BIT, 557 - REQ_F_REISSUE_BIT, 558 - REQ_F_CREDS_BIT, 559 - REQ_F_REFCOUNT_BIT, 560 - REQ_F_ARM_LTIMEOUT_BIT, 561 - REQ_F_ASYNC_DATA_BIT, 562 - REQ_F_SKIP_LINK_CQES_BIT, 563 - REQ_F_SINGLE_POLL_BIT, 564 - REQ_F_DOUBLE_POLL_BIT, 565 - REQ_F_PARTIAL_IO_BIT, 566 - REQ_F_CQE32_INIT_BIT, 567 - REQ_F_APOLL_MULTISHOT_BIT, 568 - REQ_F_CLEAR_POLLIN_BIT, 569 - /* keep async read/write and isreg together and in order */ 570 - REQ_F_SUPPORT_NOWAIT_BIT, 571 - REQ_F_ISREG_BIT, 572 - 573 - /* not a real bit, just to check we're not overflowing the space */ 574 - __REQ_F_LAST_BIT, 575 - }; 576 - 577 - enum { 578 - /* ctx owns file */ 579 - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), 580 - /* drain existing IO first */ 581 - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), 582 - /* linked sqes */ 583 - REQ_F_LINK = BIT(REQ_F_LINK_BIT), 584 - /* doesn't sever on completion < 0 */ 585 - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), 586 - /* IOSQE_ASYNC */ 587 - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), 588 - /* IOSQE_BUFFER_SELECT */ 589 - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 590 - /* IOSQE_CQE_SKIP_SUCCESS */ 591 - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), 592 - 593 - /* fail rest of links */ 594 - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), 595 - /* on inflight list, should be cancelled and waited on exit reliably */ 596 - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), 597 - /* read/write uses file position */ 598 - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), 599 - /* must not punt to workers */ 600 - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), 601 - /* has or had linked timeout */ 602 - REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), 603 - /* needs cleanup */ 604 - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), 605 - /* already went through poll handler */ 606 - REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), 607 - /* buffer already selected */ 608 - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), 609 - /* buffer selected from ring, needs commit */ 610 - REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), 611 - /* completion is deferred through io_comp_state */ 612 - REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), 613 - /* caller should reissue async */ 614 - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), 615 - /* supports async reads/writes */ 616 - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), 617 - /* regular file */ 618 - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), 619 - /* has creds assigned */ 620 - REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), 621 - /* skip refcounting if not set */ 622 - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), 623 - /* there is a linked timeout that has to be armed */ 624 - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), 625 - /* ->async_data allocated */ 626 - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), 627 - /* don't post CQEs while failing linked requests */ 628 - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), 629 - /* single poll may be active */ 630 - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), 631 - /* double poll may active */ 632 - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), 633 - /* request has already done partial IO */ 634 - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), 635 - /* fast poll multishot mode */ 636 - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), 637 - /* ->extra1 and ->extra2 are initialised */ 638 - REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), 639 - /* recvmsg special flag, clear EPOLLIN */ 640 - REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), 641 - }; 642 - 643 811 struct async_poll { 644 812 struct io_poll poll; 645 813 struct io_poll *double_poll; 646 - }; 647 - 648 - typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); 649 - 650 - struct io_task_work { 651 - union { 652 - struct io_wq_work_node node; 653 - struct llist_node fallback_node; 654 - }; 655 - io_req_tw_func_t func; 656 814 }; 657 815 658 816 enum { ··· 544 934 IORING_RSRC_BUFFER = 1, 545 935 }; 546 936 547 - struct io_cqe { 548 - __u64 user_data; 549 - __s32 res; 550 - /* fd initially, then cflags for completion */ 551 - union { 552 - __u32 flags; 553 - int fd; 554 - }; 555 - }; 556 - 557 937 enum { 558 938 IO_CHECK_CQ_OVERFLOW_BIT, 559 939 IO_CHECK_CQ_DROPPED_BIT, 560 - }; 561 - 562 - /* 563 - * Each request type overlays its private data structure on top of this one. 564 - * They must not exceed this one in size. 565 - */ 566 - struct io_cmd_data { 567 - struct file *file; 568 - /* each command gets 56 bytes of data */ 569 - __u8 data[56]; 570 - }; 571 - 572 - #define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) 573 - #define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) 574 - 575 - struct io_kiocb { 576 - union { 577 - /* 578 - * NOTE! Each of the io_kiocb union members has the file pointer 579 - * as the first entry in their struct definition. So you can 580 - * access the file pointer through any of the sub-structs, 581 - * or directly as just 'file' in this struct. 582 - */ 583 - struct file *file; 584 - struct io_cmd_data cmd; 585 - }; 586 - 587 - u8 opcode; 588 - /* polled IO has completed */ 589 - u8 iopoll_completed; 590 - /* 591 - * Can be either a fixed buffer index, or used with provided buffers. 592 - * For the latter, before issue it points to the buffer group ID, 593 - * and after selection it points to the buffer ID itself. 594 - */ 595 - u16 buf_index; 596 - unsigned int flags; 597 - 598 - struct io_cqe cqe; 599 - 600 - struct io_ring_ctx *ctx; 601 - struct task_struct *task; 602 - 603 - struct io_rsrc_node *rsrc_node; 604 - 605 - union { 606 - /* store used ubuf, so we can prevent reloading */ 607 - struct io_mapped_ubuf *imu; 608 - 609 - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ 610 - struct io_buffer *kbuf; 611 - 612 - /* 613 - * stores buffer ID for ring provided buffers, valid IFF 614 - * REQ_F_BUFFER_RING is set. 615 - */ 616 - struct io_buffer_list *buf_list; 617 - }; 618 - 619 - union { 620 - /* used by request caches, completion batching and iopoll */ 621 - struct io_wq_work_node comp_list; 622 - /* cache ->apoll->events */ 623 - __poll_t apoll_events; 624 - }; 625 - atomic_t refs; 626 - atomic_t poll_refs; 627 - struct io_task_work io_task_work; 628 - /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ 629 - union { 630 - struct hlist_node hash_node; 631 - struct { 632 - u64 extra1; 633 - u64 extra2; 634 - }; 635 - }; 636 - /* internal polling, see IORING_FEAT_FAST_POLL */ 637 - struct async_poll *apoll; 638 - /* opcode allocated if it needs to store data for async defer */ 639 - void *async_data; 640 - /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ 641 - struct io_kiocb *link; 642 - /* custom credentials, valid IFF REQ_F_CREDS is set */ 643 - const struct cred *creds; 644 - struct io_wq_work work; 645 940 }; 646 941 647 942 struct io_tctx_node {
+496
io_uring/io_uring_types.h
··· 1 + #ifndef IO_URING_TYPES_H 2 + #define IO_URING_TYPES_H 3 + 4 + #include <linux/blkdev.h> 5 + #include <linux/task_work.h> 6 + 7 + #include "io-wq.h" 8 + 9 + struct io_uring { 10 + u32 head ____cacheline_aligned_in_smp; 11 + u32 tail ____cacheline_aligned_in_smp; 12 + }; 13 + 14 + /* 15 + * This data is shared with the application through the mmap at offsets 16 + * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. 17 + * 18 + * The offsets to the member fields are published through struct 19 + * io_sqring_offsets when calling io_uring_setup. 20 + */ 21 + struct io_rings { 22 + /* 23 + * Head and tail offsets into the ring; the offsets need to be 24 + * masked to get valid indices. 25 + * 26 + * The kernel controls head of the sq ring and the tail of the cq ring, 27 + * and the application controls tail of the sq ring and the head of the 28 + * cq ring. 29 + */ 30 + struct io_uring sq, cq; 31 + /* 32 + * Bitmasks to apply to head and tail offsets (constant, equals 33 + * ring_entries - 1) 34 + */ 35 + u32 sq_ring_mask, cq_ring_mask; 36 + /* Ring sizes (constant, power of 2) */ 37 + u32 sq_ring_entries, cq_ring_entries; 38 + /* 39 + * Number of invalid entries dropped by the kernel due to 40 + * invalid index stored in array 41 + * 42 + * Written by the kernel, shouldn't be modified by the 43 + * application (i.e. get number of "new events" by comparing to 44 + * cached value). 45 + * 46 + * After a new SQ head value was read by the application this 47 + * counter includes all submissions that were dropped reaching 48 + * the new SQ head (and possibly more). 49 + */ 50 + u32 sq_dropped; 51 + /* 52 + * Runtime SQ flags 53 + * 54 + * Written by the kernel, shouldn't be modified by the 55 + * application. 56 + * 57 + * The application needs a full memory barrier before checking 58 + * for IORING_SQ_NEED_WAKEUP after updating the sq tail. 59 + */ 60 + atomic_t sq_flags; 61 + /* 62 + * Runtime CQ flags 63 + * 64 + * Written by the application, shouldn't be modified by the 65 + * kernel. 66 + */ 67 + u32 cq_flags; 68 + /* 69 + * Number of completion events lost because the queue was full; 70 + * this should be avoided by the application by making sure 71 + * there are not more requests pending than there is space in 72 + * the completion queue. 73 + * 74 + * Written by the kernel, shouldn't be modified by the 75 + * application (i.e. get number of "new events" by comparing to 76 + * cached value). 77 + * 78 + * As completion events come in out of order this counter is not 79 + * ordered with any other data. 80 + */ 81 + u32 cq_overflow; 82 + /* 83 + * Ring buffer of completion events. 84 + * 85 + * The kernel writes completion events fresh every time they are 86 + * produced, so the application is allowed to modify pending 87 + * entries. 88 + */ 89 + struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; 90 + }; 91 + 92 + struct io_restriction { 93 + DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); 94 + DECLARE_BITMAP(sqe_op, IORING_OP_LAST); 95 + u8 sqe_flags_allowed; 96 + u8 sqe_flags_required; 97 + bool registered; 98 + }; 99 + 100 + struct io_submit_link { 101 + struct io_kiocb *head; 102 + struct io_kiocb *last; 103 + }; 104 + 105 + struct io_submit_state { 106 + /* inline/task_work completion list, under ->uring_lock */ 107 + struct io_wq_work_node free_list; 108 + /* batch completion logic */ 109 + struct io_wq_work_list compl_reqs; 110 + struct io_submit_link link; 111 + 112 + bool plug_started; 113 + bool need_plug; 114 + bool flush_cqes; 115 + unsigned short submit_nr; 116 + struct blk_plug plug; 117 + }; 118 + 119 + struct io_ev_fd { 120 + struct eventfd_ctx *cq_ev_fd; 121 + unsigned int eventfd_async: 1; 122 + struct rcu_head rcu; 123 + }; 124 + 125 + struct io_file_table { 126 + struct io_fixed_file *files; 127 + unsigned long *bitmap; 128 + unsigned int alloc_hint; 129 + }; 130 + 131 + struct io_ring_ctx { 132 + /* const or read-mostly hot data */ 133 + struct { 134 + struct percpu_ref refs; 135 + 136 + struct io_rings *rings; 137 + unsigned int flags; 138 + enum task_work_notify_mode notify_method; 139 + unsigned int compat: 1; 140 + unsigned int drain_next: 1; 141 + unsigned int restricted: 1; 142 + unsigned int off_timeout_used: 1; 143 + unsigned int drain_active: 1; 144 + unsigned int drain_disabled: 1; 145 + unsigned int has_evfd: 1; 146 + unsigned int syscall_iopoll: 1; 147 + } ____cacheline_aligned_in_smp; 148 + 149 + /* submission data */ 150 + struct { 151 + struct mutex uring_lock; 152 + 153 + /* 154 + * Ring buffer of indices into array of io_uring_sqe, which is 155 + * mmapped by the application using the IORING_OFF_SQES offset. 156 + * 157 + * This indirection could e.g. be used to assign fixed 158 + * io_uring_sqe entries to operations and only submit them to 159 + * the queue when needed. 160 + * 161 + * The kernel modifies neither the indices array nor the entries 162 + * array. 163 + */ 164 + u32 *sq_array; 165 + struct io_uring_sqe *sq_sqes; 166 + unsigned cached_sq_head; 167 + unsigned sq_entries; 168 + struct list_head defer_list; 169 + 170 + /* 171 + * Fixed resources fast path, should be accessed only under 172 + * uring_lock, and updated through io_uring_register(2) 173 + */ 174 + struct io_rsrc_node *rsrc_node; 175 + int rsrc_cached_refs; 176 + atomic_t cancel_seq; 177 + struct io_file_table file_table; 178 + unsigned nr_user_files; 179 + unsigned nr_user_bufs; 180 + struct io_mapped_ubuf **user_bufs; 181 + 182 + struct io_submit_state submit_state; 183 + 184 + struct io_buffer_list *io_bl; 185 + struct xarray io_bl_xa; 186 + struct list_head io_buffers_cache; 187 + 188 + struct list_head timeout_list; 189 + struct list_head ltimeout_list; 190 + struct list_head cq_overflow_list; 191 + struct list_head apoll_cache; 192 + struct xarray personalities; 193 + u32 pers_next; 194 + unsigned sq_thread_idle; 195 + } ____cacheline_aligned_in_smp; 196 + 197 + /* IRQ completion list, under ->completion_lock */ 198 + struct io_wq_work_list locked_free_list; 199 + unsigned int locked_free_nr; 200 + 201 + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 202 + struct io_sq_data *sq_data; /* if using sq thread polling */ 203 + 204 + struct wait_queue_head sqo_sq_wait; 205 + struct list_head sqd_list; 206 + 207 + unsigned long check_cq; 208 + 209 + struct { 210 + /* 211 + * We cache a range of free CQEs we can use, once exhausted it 212 + * should go through a slower range setup, see __io_get_cqe() 213 + */ 214 + struct io_uring_cqe *cqe_cached; 215 + struct io_uring_cqe *cqe_sentinel; 216 + 217 + unsigned cached_cq_tail; 218 + unsigned cq_entries; 219 + struct io_ev_fd __rcu *io_ev_fd; 220 + struct wait_queue_head cq_wait; 221 + unsigned cq_extra; 222 + atomic_t cq_timeouts; 223 + unsigned cq_last_tm_flush; 224 + } ____cacheline_aligned_in_smp; 225 + 226 + struct { 227 + spinlock_t completion_lock; 228 + 229 + spinlock_t timeout_lock; 230 + 231 + /* 232 + * ->iopoll_list is protected by the ctx->uring_lock for 233 + * io_uring instances that don't use IORING_SETUP_SQPOLL. 234 + * For SQPOLL, only the single threaded io_sq_thread() will 235 + * manipulate the list, hence no extra locking is needed there. 236 + */ 237 + struct io_wq_work_list iopoll_list; 238 + struct hlist_head *cancel_hash; 239 + unsigned cancel_hash_bits; 240 + bool poll_multi_queue; 241 + 242 + struct list_head io_buffers_comp; 243 + } ____cacheline_aligned_in_smp; 244 + 245 + struct io_restriction restrictions; 246 + 247 + /* slow path rsrc auxilary data, used by update/register */ 248 + struct { 249 + struct io_rsrc_node *rsrc_backup_node; 250 + struct io_mapped_ubuf *dummy_ubuf; 251 + struct io_rsrc_data *file_data; 252 + struct io_rsrc_data *buf_data; 253 + 254 + struct delayed_work rsrc_put_work; 255 + struct llist_head rsrc_put_llist; 256 + struct list_head rsrc_ref_list; 257 + spinlock_t rsrc_ref_lock; 258 + 259 + struct list_head io_buffers_pages; 260 + }; 261 + 262 + /* Keep this last, we don't need it for the fast path */ 263 + struct { 264 + #if defined(CONFIG_UNIX) 265 + struct socket *ring_sock; 266 + #endif 267 + /* hashed buffered write serialization */ 268 + struct io_wq_hash *hash_map; 269 + 270 + /* Only used for accounting purposes */ 271 + struct user_struct *user; 272 + struct mm_struct *mm_account; 273 + 274 + /* ctx exit and cancelation */ 275 + struct llist_head fallback_llist; 276 + struct delayed_work fallback_work; 277 + struct work_struct exit_work; 278 + struct list_head tctx_list; 279 + struct completion ref_comp; 280 + u32 iowq_limits[2]; 281 + bool iowq_limits_set; 282 + }; 283 + }; 284 + 285 + enum { 286 + REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, 287 + REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, 288 + REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, 289 + REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, 290 + REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 291 + REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 292 + REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, 293 + 294 + /* first byte is taken by user flags, shift it to not overlap */ 295 + REQ_F_FAIL_BIT = 8, 296 + REQ_F_INFLIGHT_BIT, 297 + REQ_F_CUR_POS_BIT, 298 + REQ_F_NOWAIT_BIT, 299 + REQ_F_LINK_TIMEOUT_BIT, 300 + REQ_F_NEED_CLEANUP_BIT, 301 + REQ_F_POLLED_BIT, 302 + REQ_F_BUFFER_SELECTED_BIT, 303 + REQ_F_BUFFER_RING_BIT, 304 + REQ_F_COMPLETE_INLINE_BIT, 305 + REQ_F_REISSUE_BIT, 306 + REQ_F_CREDS_BIT, 307 + REQ_F_REFCOUNT_BIT, 308 + REQ_F_ARM_LTIMEOUT_BIT, 309 + REQ_F_ASYNC_DATA_BIT, 310 + REQ_F_SKIP_LINK_CQES_BIT, 311 + REQ_F_SINGLE_POLL_BIT, 312 + REQ_F_DOUBLE_POLL_BIT, 313 + REQ_F_PARTIAL_IO_BIT, 314 + REQ_F_CQE32_INIT_BIT, 315 + REQ_F_APOLL_MULTISHOT_BIT, 316 + REQ_F_CLEAR_POLLIN_BIT, 317 + /* keep async read/write and isreg together and in order */ 318 + REQ_F_SUPPORT_NOWAIT_BIT, 319 + REQ_F_ISREG_BIT, 320 + 321 + /* not a real bit, just to check we're not overflowing the space */ 322 + __REQ_F_LAST_BIT, 323 + }; 324 + 325 + enum { 326 + /* ctx owns file */ 327 + REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), 328 + /* drain existing IO first */ 329 + REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), 330 + /* linked sqes */ 331 + REQ_F_LINK = BIT(REQ_F_LINK_BIT), 332 + /* doesn't sever on completion < 0 */ 333 + REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), 334 + /* IOSQE_ASYNC */ 335 + REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), 336 + /* IOSQE_BUFFER_SELECT */ 337 + REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 338 + /* IOSQE_CQE_SKIP_SUCCESS */ 339 + REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), 340 + 341 + /* fail rest of links */ 342 + REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), 343 + /* on inflight list, should be cancelled and waited on exit reliably */ 344 + REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), 345 + /* read/write uses file position */ 346 + REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), 347 + /* must not punt to workers */ 348 + REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), 349 + /* has or had linked timeout */ 350 + REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), 351 + /* needs cleanup */ 352 + REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), 353 + /* already went through poll handler */ 354 + REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), 355 + /* buffer already selected */ 356 + REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), 357 + /* buffer selected from ring, needs commit */ 358 + REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), 359 + /* completion is deferred through io_comp_state */ 360 + REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), 361 + /* caller should reissue async */ 362 + REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), 363 + /* supports async reads/writes */ 364 + REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), 365 + /* regular file */ 366 + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), 367 + /* has creds assigned */ 368 + REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), 369 + /* skip refcounting if not set */ 370 + REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), 371 + /* there is a linked timeout that has to be armed */ 372 + REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), 373 + /* ->async_data allocated */ 374 + REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), 375 + /* don't post CQEs while failing linked requests */ 376 + REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), 377 + /* single poll may be active */ 378 + REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), 379 + /* double poll may active */ 380 + REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), 381 + /* request has already done partial IO */ 382 + REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), 383 + /* fast poll multishot mode */ 384 + REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), 385 + /* ->extra1 and ->extra2 are initialised */ 386 + REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), 387 + /* recvmsg special flag, clear EPOLLIN */ 388 + REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), 389 + }; 390 + 391 + typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); 392 + 393 + struct io_task_work { 394 + union { 395 + struct io_wq_work_node node; 396 + struct llist_node fallback_node; 397 + }; 398 + io_req_tw_func_t func; 399 + }; 400 + 401 + struct io_cqe { 402 + __u64 user_data; 403 + __s32 res; 404 + /* fd initially, then cflags for completion */ 405 + union { 406 + __u32 flags; 407 + int fd; 408 + }; 409 + }; 410 + 411 + /* 412 + * Each request type overlays its private data structure on top of this one. 413 + * They must not exceed this one in size. 414 + */ 415 + struct io_cmd_data { 416 + struct file *file; 417 + /* each command gets 56 bytes of data */ 418 + __u8 data[56]; 419 + }; 420 + 421 + #define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) 422 + #define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) 423 + 424 + struct io_kiocb { 425 + union { 426 + /* 427 + * NOTE! Each of the io_kiocb union members has the file pointer 428 + * as the first entry in their struct definition. So you can 429 + * access the file pointer through any of the sub-structs, 430 + * or directly as just 'file' in this struct. 431 + */ 432 + struct file *file; 433 + struct io_cmd_data cmd; 434 + }; 435 + 436 + u8 opcode; 437 + /* polled IO has completed */ 438 + u8 iopoll_completed; 439 + /* 440 + * Can be either a fixed buffer index, or used with provided buffers. 441 + * For the latter, before issue it points to the buffer group ID, 442 + * and after selection it points to the buffer ID itself. 443 + */ 444 + u16 buf_index; 445 + unsigned int flags; 446 + 447 + struct io_cqe cqe; 448 + 449 + struct io_ring_ctx *ctx; 450 + struct task_struct *task; 451 + 452 + struct io_rsrc_node *rsrc_node; 453 + 454 + union { 455 + /* store used ubuf, so we can prevent reloading */ 456 + struct io_mapped_ubuf *imu; 457 + 458 + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ 459 + struct io_buffer *kbuf; 460 + 461 + /* 462 + * stores buffer ID for ring provided buffers, valid IFF 463 + * REQ_F_BUFFER_RING is set. 464 + */ 465 + struct io_buffer_list *buf_list; 466 + }; 467 + 468 + union { 469 + /* used by request caches, completion batching and iopoll */ 470 + struct io_wq_work_node comp_list; 471 + /* cache ->apoll->events */ 472 + __poll_t apoll_events; 473 + }; 474 + atomic_t refs; 475 + atomic_t poll_refs; 476 + struct io_task_work io_task_work; 477 + /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ 478 + union { 479 + struct hlist_node hash_node; 480 + struct { 481 + u64 extra1; 482 + u64 extra2; 483 + }; 484 + }; 485 + /* internal polling, see IORING_FEAT_FAST_POLL */ 486 + struct async_poll *apoll; 487 + /* opcode allocated if it needs to store data for async defer */ 488 + void *async_data; 489 + /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ 490 + struct io_kiocb *link; 491 + /* custom credentials, valid IFF REQ_F_CREDS is set */ 492 + const struct cred *creds; 493 + struct io_wq_work work; 494 + }; 495 + 496 + #endif