···9090#include "../fs/internal.h"9191#include "io-wq.h"92929393+#include "io_uring_types.h"9494+9395#define IORING_MAX_ENTRIES 327689496#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)9597#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8···123121#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)124122125123#define IO_TCTX_REFS_CACHE_NR (1U << 10)126126-127127-struct io_uring {128128- u32 head ____cacheline_aligned_in_smp;129129- u32 tail ____cacheline_aligned_in_smp;130130-};131131-132132-/*133133- * This data is shared with the application through the mmap at offsets134134- * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.135135- *136136- * The offsets to the member fields are published through struct137137- * io_sqring_offsets when calling io_uring_setup.138138- */139139-struct io_rings {140140- /*141141- * Head and tail offsets into the ring; the offsets need to be142142- * masked to get valid indices.143143- *144144- * The kernel controls head of the sq ring and the tail of the cq ring,145145- * and the application controls tail of the sq ring and the head of the146146- * cq ring.147147- */148148- struct io_uring sq, cq;149149- /*150150- * Bitmasks to apply to head and tail offsets (constant, equals151151- * ring_entries - 1)152152- */153153- u32 sq_ring_mask, cq_ring_mask;154154- /* Ring sizes (constant, power of 2) */155155- u32 sq_ring_entries, cq_ring_entries;156156- /*157157- * Number of invalid entries dropped by the kernel due to158158- * invalid index stored in array159159- *160160- * Written by the kernel, shouldn't be modified by the161161- * application (i.e. get number of "new events" by comparing to162162- * cached value).163163- *164164- * After a new SQ head value was read by the application this165165- * counter includes all submissions that were dropped reaching166166- * the new SQ head (and possibly more).167167- */168168- u32 sq_dropped;169169- /*170170- * Runtime SQ flags171171- *172172- * Written by the kernel, shouldn't be modified by the173173- * application.174174- *175175- * The application needs a full memory barrier before checking176176- * for IORING_SQ_NEED_WAKEUP after updating the sq tail.177177- */178178- atomic_t sq_flags;179179- /*180180- * Runtime CQ flags181181- *182182- * Written by the application, shouldn't be modified by the183183- * kernel.184184- */185185- u32 cq_flags;186186- /*187187- * Number of completion events lost because the queue was full;188188- * this should be avoided by the application by making sure189189- * there are not more requests pending than there is space in190190- * the completion queue.191191- *192192- * Written by the kernel, shouldn't be modified by the193193- * application (i.e. get number of "new events" by comparing to194194- * cached value).195195- *196196- * As completion events come in out of order this counter is not197197- * ordered with any other data.198198- */199199- u32 cq_overflow;200200- /*201201- * Ring buffer of completion events.202202- *203203- * The kernel writes completion events fresh every time they are204204- * produced, so the application is allowed to modify pending205205- * entries.206206- */207207- struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;208208-};209124210125struct io_mapped_ubuf {211126 u64 ubuf;···169250 struct file *file;170251 struct io_mapped_ubuf *buf;171252 };172172-};173173-174174-struct io_file_table {175175- struct io_fixed_file *files;176176- unsigned long *bitmap;177177- unsigned int alloc_hint;178253};179254180255struct io_rsrc_node {···223310 __u16 bgid;224311};225312226226-struct io_restriction {227227- DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);228228- DECLARE_BITMAP(sqe_op, IORING_OP_LAST);229229- u8 sqe_flags_allowed;230230- u8 sqe_flags_required;231231- bool registered;232232-};233233-234313enum {235314 IO_SQ_THREAD_SHOULD_STOP = 0,236315 IO_SQ_THREAD_SHOULD_PARK,···252347#define IO_REQ_CACHE_SIZE 32253348#define IO_REQ_ALLOC_BATCH 8254349255255-struct io_submit_link {256256- struct io_kiocb *head;257257- struct io_kiocb *last;258258-};259259-260260-struct io_submit_state {261261- /* inline/task_work completion list, under ->uring_lock */262262- struct io_wq_work_node free_list;263263- /* batch completion logic */264264- struct io_wq_work_list compl_reqs;265265- struct io_submit_link link;266266-267267- bool plug_started;268268- bool need_plug;269269- bool flush_cqes;270270- unsigned short submit_nr;271271- struct blk_plug plug;272272-};273273-274274-struct io_ev_fd {275275- struct eventfd_ctx *cq_ev_fd;276276- unsigned int eventfd_async: 1;277277- struct rcu_head rcu;278278-};279279-280280-#define BGID_ARRAY 64281281-282282-struct io_ring_ctx {283283- /* const or read-mostly hot data */284284- struct {285285- struct percpu_ref refs;286286-287287- struct io_rings *rings;288288- unsigned int flags;289289- enum task_work_notify_mode notify_method;290290- unsigned int compat: 1;291291- unsigned int drain_next: 1;292292- unsigned int restricted: 1;293293- unsigned int off_timeout_used: 1;294294- unsigned int drain_active: 1;295295- unsigned int drain_disabled: 1;296296- unsigned int has_evfd: 1;297297- unsigned int syscall_iopoll: 1;298298- } ____cacheline_aligned_in_smp;299299-300300- /* submission data */301301- struct {302302- struct mutex uring_lock;303303-304304- /*305305- * Ring buffer of indices into array of io_uring_sqe, which is306306- * mmapped by the application using the IORING_OFF_SQES offset.307307- *308308- * This indirection could e.g. be used to assign fixed309309- * io_uring_sqe entries to operations and only submit them to310310- * the queue when needed.311311- *312312- * The kernel modifies neither the indices array nor the entries313313- * array.314314- */315315- u32 *sq_array;316316- struct io_uring_sqe *sq_sqes;317317- unsigned cached_sq_head;318318- unsigned sq_entries;319319- struct list_head defer_list;320320-321321- /*322322- * Fixed resources fast path, should be accessed only under323323- * uring_lock, and updated through io_uring_register(2)324324- */325325- struct io_rsrc_node *rsrc_node;326326- int rsrc_cached_refs;327327- atomic_t cancel_seq;328328- struct io_file_table file_table;329329- unsigned nr_user_files;330330- unsigned nr_user_bufs;331331- struct io_mapped_ubuf **user_bufs;332332-333333- struct io_submit_state submit_state;334334-335335- struct io_buffer_list *io_bl;336336- struct xarray io_bl_xa;337337- struct list_head io_buffers_cache;338338-339339- struct list_head timeout_list;340340- struct list_head ltimeout_list;341341- struct list_head cq_overflow_list;342342- struct list_head apoll_cache;343343- struct xarray personalities;344344- u32 pers_next;345345- unsigned sq_thread_idle;346346- } ____cacheline_aligned_in_smp;347347-348348- /* IRQ completion list, under ->completion_lock */349349- struct io_wq_work_list locked_free_list;350350- unsigned int locked_free_nr;351351-352352- const struct cred *sq_creds; /* cred used for __io_sq_thread() */353353- struct io_sq_data *sq_data; /* if using sq thread polling */354354-355355- struct wait_queue_head sqo_sq_wait;356356- struct list_head sqd_list;357357-358358- unsigned long check_cq;359359-360360- struct {361361- /*362362- * We cache a range of free CQEs we can use, once exhausted it363363- * should go through a slower range setup, see __io_get_cqe()364364- */365365- struct io_uring_cqe *cqe_cached;366366- struct io_uring_cqe *cqe_sentinel;367367-368368- unsigned cached_cq_tail;369369- unsigned cq_entries;370370- struct io_ev_fd __rcu *io_ev_fd;371371- struct wait_queue_head cq_wait;372372- unsigned cq_extra;373373- atomic_t cq_timeouts;374374- unsigned cq_last_tm_flush;375375- } ____cacheline_aligned_in_smp;376376-377377- struct {378378- spinlock_t completion_lock;379379-380380- spinlock_t timeout_lock;381381-382382- /*383383- * ->iopoll_list is protected by the ctx->uring_lock for384384- * io_uring instances that don't use IORING_SETUP_SQPOLL.385385- * For SQPOLL, only the single threaded io_sq_thread() will386386- * manipulate the list, hence no extra locking is needed there.387387- */388388- struct io_wq_work_list iopoll_list;389389- struct hlist_head *cancel_hash;390390- unsigned cancel_hash_bits;391391- bool poll_multi_queue;392392-393393- struct list_head io_buffers_comp;394394- } ____cacheline_aligned_in_smp;395395-396396- struct io_restriction restrictions;397397-398398- /* slow path rsrc auxilary data, used by update/register */399399- struct {400400- struct io_rsrc_node *rsrc_backup_node;401401- struct io_mapped_ubuf *dummy_ubuf;402402- struct io_rsrc_data *file_data;403403- struct io_rsrc_data *buf_data;404404-405405- struct delayed_work rsrc_put_work;406406- struct llist_head rsrc_put_llist;407407- struct list_head rsrc_ref_list;408408- spinlock_t rsrc_ref_lock;409409-410410- struct list_head io_buffers_pages;411411- };412412-413413- /* Keep this last, we don't need it for the fast path */414414- struct {415415- #if defined(CONFIG_UNIX)416416- struct socket *ring_sock;417417- #endif418418- /* hashed buffered write serialization */419419- struct io_wq_hash *hash_map;420420-421421- /* Only used for accounting purposes */422422- struct user_struct *user;423423- struct mm_struct *mm_account;424424-425425- /* ctx exit and cancelation */426426- struct llist_head fallback_llist;427427- struct delayed_work fallback_work;428428- struct work_struct exit_work;429429- struct list_head tctx_list;430430- struct completion ref_comp;431431- u32 iowq_limits[2];432432- bool iowq_limits_set;433433- };434434-};350350+#define BGID_ARRAY 64435351436352/*437353 * Arbitrary limit, can be raised if need be···534808 struct filename *filename;535809};536810537537-enum {538538- REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,539539- REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,540540- REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,541541- REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,542542- REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,543543- REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,544544- REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,545545-546546- /* first byte is taken by user flags, shift it to not overlap */547547- REQ_F_FAIL_BIT = 8,548548- REQ_F_INFLIGHT_BIT,549549- REQ_F_CUR_POS_BIT,550550- REQ_F_NOWAIT_BIT,551551- REQ_F_LINK_TIMEOUT_BIT,552552- REQ_F_NEED_CLEANUP_BIT,553553- REQ_F_POLLED_BIT,554554- REQ_F_BUFFER_SELECTED_BIT,555555- REQ_F_BUFFER_RING_BIT,556556- REQ_F_COMPLETE_INLINE_BIT,557557- REQ_F_REISSUE_BIT,558558- REQ_F_CREDS_BIT,559559- REQ_F_REFCOUNT_BIT,560560- REQ_F_ARM_LTIMEOUT_BIT,561561- REQ_F_ASYNC_DATA_BIT,562562- REQ_F_SKIP_LINK_CQES_BIT,563563- REQ_F_SINGLE_POLL_BIT,564564- REQ_F_DOUBLE_POLL_BIT,565565- REQ_F_PARTIAL_IO_BIT,566566- REQ_F_CQE32_INIT_BIT,567567- REQ_F_APOLL_MULTISHOT_BIT,568568- REQ_F_CLEAR_POLLIN_BIT,569569- /* keep async read/write and isreg together and in order */570570- REQ_F_SUPPORT_NOWAIT_BIT,571571- REQ_F_ISREG_BIT,572572-573573- /* not a real bit, just to check we're not overflowing the space */574574- __REQ_F_LAST_BIT,575575-};576576-577577-enum {578578- /* ctx owns file */579579- REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),580580- /* drain existing IO first */581581- REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),582582- /* linked sqes */583583- REQ_F_LINK = BIT(REQ_F_LINK_BIT),584584- /* doesn't sever on completion < 0 */585585- REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),586586- /* IOSQE_ASYNC */587587- REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),588588- /* IOSQE_BUFFER_SELECT */589589- REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),590590- /* IOSQE_CQE_SKIP_SUCCESS */591591- REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),592592-593593- /* fail rest of links */594594- REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),595595- /* on inflight list, should be cancelled and waited on exit reliably */596596- REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),597597- /* read/write uses file position */598598- REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),599599- /* must not punt to workers */600600- REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),601601- /* has or had linked timeout */602602- REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),603603- /* needs cleanup */604604- REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),605605- /* already went through poll handler */606606- REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),607607- /* buffer already selected */608608- REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),609609- /* buffer selected from ring, needs commit */610610- REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),611611- /* completion is deferred through io_comp_state */612612- REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),613613- /* caller should reissue async */614614- REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),615615- /* supports async reads/writes */616616- REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),617617- /* regular file */618618- REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),619619- /* has creds assigned */620620- REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),621621- /* skip refcounting if not set */622622- REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),623623- /* there is a linked timeout that has to be armed */624624- REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),625625- /* ->async_data allocated */626626- REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),627627- /* don't post CQEs while failing linked requests */628628- REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),629629- /* single poll may be active */630630- REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),631631- /* double poll may active */632632- REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),633633- /* request has already done partial IO */634634- REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),635635- /* fast poll multishot mode */636636- REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),637637- /* ->extra1 and ->extra2 are initialised */638638- REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),639639- /* recvmsg special flag, clear EPOLLIN */640640- REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),641641-};642642-643811struct async_poll {644812 struct io_poll poll;645813 struct io_poll *double_poll;646646-};647647-648648-typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);649649-650650-struct io_task_work {651651- union {652652- struct io_wq_work_node node;653653- struct llist_node fallback_node;654654- };655655- io_req_tw_func_t func;656814};657815658816enum {···544934 IORING_RSRC_BUFFER = 1,545935};546936547547-struct io_cqe {548548- __u64 user_data;549549- __s32 res;550550- /* fd initially, then cflags for completion */551551- union {552552- __u32 flags;553553- int fd;554554- };555555-};556556-557937enum {558938 IO_CHECK_CQ_OVERFLOW_BIT,559939 IO_CHECK_CQ_DROPPED_BIT,560560-};561561-562562-/*563563- * Each request type overlays its private data structure on top of this one.564564- * They must not exceed this one in size.565565- */566566-struct io_cmd_data {567567- struct file *file;568568- /* each command gets 56 bytes of data */569569- __u8 data[56];570570-};571571-572572-#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd)573573-#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr)574574-575575-struct io_kiocb {576576- union {577577- /*578578- * NOTE! Each of the io_kiocb union members has the file pointer579579- * as the first entry in their struct definition. So you can580580- * access the file pointer through any of the sub-structs,581581- * or directly as just 'file' in this struct.582582- */583583- struct file *file;584584- struct io_cmd_data cmd;585585- };586586-587587- u8 opcode;588588- /* polled IO has completed */589589- u8 iopoll_completed;590590- /*591591- * Can be either a fixed buffer index, or used with provided buffers.592592- * For the latter, before issue it points to the buffer group ID,593593- * and after selection it points to the buffer ID itself.594594- */595595- u16 buf_index;596596- unsigned int flags;597597-598598- struct io_cqe cqe;599599-600600- struct io_ring_ctx *ctx;601601- struct task_struct *task;602602-603603- struct io_rsrc_node *rsrc_node;604604-605605- union {606606- /* store used ubuf, so we can prevent reloading */607607- struct io_mapped_ubuf *imu;608608-609609- /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */610610- struct io_buffer *kbuf;611611-612612- /*613613- * stores buffer ID for ring provided buffers, valid IFF614614- * REQ_F_BUFFER_RING is set.615615- */616616- struct io_buffer_list *buf_list;617617- };618618-619619- union {620620- /* used by request caches, completion batching and iopoll */621621- struct io_wq_work_node comp_list;622622- /* cache ->apoll->events */623623- __poll_t apoll_events;624624- };625625- atomic_t refs;626626- atomic_t poll_refs;627627- struct io_task_work io_task_work;628628- /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */629629- union {630630- struct hlist_node hash_node;631631- struct {632632- u64 extra1;633633- u64 extra2;634634- };635635- };636636- /* internal polling, see IORING_FEAT_FAST_POLL */637637- struct async_poll *apoll;638638- /* opcode allocated if it needs to store data for async defer */639639- void *async_data;640640- /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */641641- struct io_kiocb *link;642642- /* custom credentials, valid IFF REQ_F_CREDS is set */643643- const struct cred *creds;644644- struct io_wq_work work;645940};646941647942struct io_tctx_node {
+496
io_uring/io_uring_types.h
···11+#ifndef IO_URING_TYPES_H22+#define IO_URING_TYPES_H33+44+#include <linux/blkdev.h>55+#include <linux/task_work.h>66+77+#include "io-wq.h"88+99+struct io_uring {1010+ u32 head ____cacheline_aligned_in_smp;1111+ u32 tail ____cacheline_aligned_in_smp;1212+};1313+1414+/*1515+ * This data is shared with the application through the mmap at offsets1616+ * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.1717+ *1818+ * The offsets to the member fields are published through struct1919+ * io_sqring_offsets when calling io_uring_setup.2020+ */2121+struct io_rings {2222+ /*2323+ * Head and tail offsets into the ring; the offsets need to be2424+ * masked to get valid indices.2525+ *2626+ * The kernel controls head of the sq ring and the tail of the cq ring,2727+ * and the application controls tail of the sq ring and the head of the2828+ * cq ring.2929+ */3030+ struct io_uring sq, cq;3131+ /*3232+ * Bitmasks to apply to head and tail offsets (constant, equals3333+ * ring_entries - 1)3434+ */3535+ u32 sq_ring_mask, cq_ring_mask;3636+ /* Ring sizes (constant, power of 2) */3737+ u32 sq_ring_entries, cq_ring_entries;3838+ /*3939+ * Number of invalid entries dropped by the kernel due to4040+ * invalid index stored in array4141+ *4242+ * Written by the kernel, shouldn't be modified by the4343+ * application (i.e. get number of "new events" by comparing to4444+ * cached value).4545+ *4646+ * After a new SQ head value was read by the application this4747+ * counter includes all submissions that were dropped reaching4848+ * the new SQ head (and possibly more).4949+ */5050+ u32 sq_dropped;5151+ /*5252+ * Runtime SQ flags5353+ *5454+ * Written by the kernel, shouldn't be modified by the5555+ * application.5656+ *5757+ * The application needs a full memory barrier before checking5858+ * for IORING_SQ_NEED_WAKEUP after updating the sq tail.5959+ */6060+ atomic_t sq_flags;6161+ /*6262+ * Runtime CQ flags6363+ *6464+ * Written by the application, shouldn't be modified by the6565+ * kernel.6666+ */6767+ u32 cq_flags;6868+ /*6969+ * Number of completion events lost because the queue was full;7070+ * this should be avoided by the application by making sure7171+ * there are not more requests pending than there is space in7272+ * the completion queue.7373+ *7474+ * Written by the kernel, shouldn't be modified by the7575+ * application (i.e. get number of "new events" by comparing to7676+ * cached value).7777+ *7878+ * As completion events come in out of order this counter is not7979+ * ordered with any other data.8080+ */8181+ u32 cq_overflow;8282+ /*8383+ * Ring buffer of completion events.8484+ *8585+ * The kernel writes completion events fresh every time they are8686+ * produced, so the application is allowed to modify pending8787+ * entries.8888+ */8989+ struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;9090+};9191+9292+struct io_restriction {9393+ DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);9494+ DECLARE_BITMAP(sqe_op, IORING_OP_LAST);9595+ u8 sqe_flags_allowed;9696+ u8 sqe_flags_required;9797+ bool registered;9898+};9999+100100+struct io_submit_link {101101+ struct io_kiocb *head;102102+ struct io_kiocb *last;103103+};104104+105105+struct io_submit_state {106106+ /* inline/task_work completion list, under ->uring_lock */107107+ struct io_wq_work_node free_list;108108+ /* batch completion logic */109109+ struct io_wq_work_list compl_reqs;110110+ struct io_submit_link link;111111+112112+ bool plug_started;113113+ bool need_plug;114114+ bool flush_cqes;115115+ unsigned short submit_nr;116116+ struct blk_plug plug;117117+};118118+119119+struct io_ev_fd {120120+ struct eventfd_ctx *cq_ev_fd;121121+ unsigned int eventfd_async: 1;122122+ struct rcu_head rcu;123123+};124124+125125+struct io_file_table {126126+ struct io_fixed_file *files;127127+ unsigned long *bitmap;128128+ unsigned int alloc_hint;129129+};130130+131131+struct io_ring_ctx {132132+ /* const or read-mostly hot data */133133+ struct {134134+ struct percpu_ref refs;135135+136136+ struct io_rings *rings;137137+ unsigned int flags;138138+ enum task_work_notify_mode notify_method;139139+ unsigned int compat: 1;140140+ unsigned int drain_next: 1;141141+ unsigned int restricted: 1;142142+ unsigned int off_timeout_used: 1;143143+ unsigned int drain_active: 1;144144+ unsigned int drain_disabled: 1;145145+ unsigned int has_evfd: 1;146146+ unsigned int syscall_iopoll: 1;147147+ } ____cacheline_aligned_in_smp;148148+149149+ /* submission data */150150+ struct {151151+ struct mutex uring_lock;152152+153153+ /*154154+ * Ring buffer of indices into array of io_uring_sqe, which is155155+ * mmapped by the application using the IORING_OFF_SQES offset.156156+ *157157+ * This indirection could e.g. be used to assign fixed158158+ * io_uring_sqe entries to operations and only submit them to159159+ * the queue when needed.160160+ *161161+ * The kernel modifies neither the indices array nor the entries162162+ * array.163163+ */164164+ u32 *sq_array;165165+ struct io_uring_sqe *sq_sqes;166166+ unsigned cached_sq_head;167167+ unsigned sq_entries;168168+ struct list_head defer_list;169169+170170+ /*171171+ * Fixed resources fast path, should be accessed only under172172+ * uring_lock, and updated through io_uring_register(2)173173+ */174174+ struct io_rsrc_node *rsrc_node;175175+ int rsrc_cached_refs;176176+ atomic_t cancel_seq;177177+ struct io_file_table file_table;178178+ unsigned nr_user_files;179179+ unsigned nr_user_bufs;180180+ struct io_mapped_ubuf **user_bufs;181181+182182+ struct io_submit_state submit_state;183183+184184+ struct io_buffer_list *io_bl;185185+ struct xarray io_bl_xa;186186+ struct list_head io_buffers_cache;187187+188188+ struct list_head timeout_list;189189+ struct list_head ltimeout_list;190190+ struct list_head cq_overflow_list;191191+ struct list_head apoll_cache;192192+ struct xarray personalities;193193+ u32 pers_next;194194+ unsigned sq_thread_idle;195195+ } ____cacheline_aligned_in_smp;196196+197197+ /* IRQ completion list, under ->completion_lock */198198+ struct io_wq_work_list locked_free_list;199199+ unsigned int locked_free_nr;200200+201201+ const struct cred *sq_creds; /* cred used for __io_sq_thread() */202202+ struct io_sq_data *sq_data; /* if using sq thread polling */203203+204204+ struct wait_queue_head sqo_sq_wait;205205+ struct list_head sqd_list;206206+207207+ unsigned long check_cq;208208+209209+ struct {210210+ /*211211+ * We cache a range of free CQEs we can use, once exhausted it212212+ * should go through a slower range setup, see __io_get_cqe()213213+ */214214+ struct io_uring_cqe *cqe_cached;215215+ struct io_uring_cqe *cqe_sentinel;216216+217217+ unsigned cached_cq_tail;218218+ unsigned cq_entries;219219+ struct io_ev_fd __rcu *io_ev_fd;220220+ struct wait_queue_head cq_wait;221221+ unsigned cq_extra;222222+ atomic_t cq_timeouts;223223+ unsigned cq_last_tm_flush;224224+ } ____cacheline_aligned_in_smp;225225+226226+ struct {227227+ spinlock_t completion_lock;228228+229229+ spinlock_t timeout_lock;230230+231231+ /*232232+ * ->iopoll_list is protected by the ctx->uring_lock for233233+ * io_uring instances that don't use IORING_SETUP_SQPOLL.234234+ * For SQPOLL, only the single threaded io_sq_thread() will235235+ * manipulate the list, hence no extra locking is needed there.236236+ */237237+ struct io_wq_work_list iopoll_list;238238+ struct hlist_head *cancel_hash;239239+ unsigned cancel_hash_bits;240240+ bool poll_multi_queue;241241+242242+ struct list_head io_buffers_comp;243243+ } ____cacheline_aligned_in_smp;244244+245245+ struct io_restriction restrictions;246246+247247+ /* slow path rsrc auxilary data, used by update/register */248248+ struct {249249+ struct io_rsrc_node *rsrc_backup_node;250250+ struct io_mapped_ubuf *dummy_ubuf;251251+ struct io_rsrc_data *file_data;252252+ struct io_rsrc_data *buf_data;253253+254254+ struct delayed_work rsrc_put_work;255255+ struct llist_head rsrc_put_llist;256256+ struct list_head rsrc_ref_list;257257+ spinlock_t rsrc_ref_lock;258258+259259+ struct list_head io_buffers_pages;260260+ };261261+262262+ /* Keep this last, we don't need it for the fast path */263263+ struct {264264+ #if defined(CONFIG_UNIX)265265+ struct socket *ring_sock;266266+ #endif267267+ /* hashed buffered write serialization */268268+ struct io_wq_hash *hash_map;269269+270270+ /* Only used for accounting purposes */271271+ struct user_struct *user;272272+ struct mm_struct *mm_account;273273+274274+ /* ctx exit and cancelation */275275+ struct llist_head fallback_llist;276276+ struct delayed_work fallback_work;277277+ struct work_struct exit_work;278278+ struct list_head tctx_list;279279+ struct completion ref_comp;280280+ u32 iowq_limits[2];281281+ bool iowq_limits_set;282282+ };283283+};284284+285285+enum {286286+ REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,287287+ REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,288288+ REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,289289+ REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,290290+ REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,291291+ REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,292292+ REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,293293+294294+ /* first byte is taken by user flags, shift it to not overlap */295295+ REQ_F_FAIL_BIT = 8,296296+ REQ_F_INFLIGHT_BIT,297297+ REQ_F_CUR_POS_BIT,298298+ REQ_F_NOWAIT_BIT,299299+ REQ_F_LINK_TIMEOUT_BIT,300300+ REQ_F_NEED_CLEANUP_BIT,301301+ REQ_F_POLLED_BIT,302302+ REQ_F_BUFFER_SELECTED_BIT,303303+ REQ_F_BUFFER_RING_BIT,304304+ REQ_F_COMPLETE_INLINE_BIT,305305+ REQ_F_REISSUE_BIT,306306+ REQ_F_CREDS_BIT,307307+ REQ_F_REFCOUNT_BIT,308308+ REQ_F_ARM_LTIMEOUT_BIT,309309+ REQ_F_ASYNC_DATA_BIT,310310+ REQ_F_SKIP_LINK_CQES_BIT,311311+ REQ_F_SINGLE_POLL_BIT,312312+ REQ_F_DOUBLE_POLL_BIT,313313+ REQ_F_PARTIAL_IO_BIT,314314+ REQ_F_CQE32_INIT_BIT,315315+ REQ_F_APOLL_MULTISHOT_BIT,316316+ REQ_F_CLEAR_POLLIN_BIT,317317+ /* keep async read/write and isreg together and in order */318318+ REQ_F_SUPPORT_NOWAIT_BIT,319319+ REQ_F_ISREG_BIT,320320+321321+ /* not a real bit, just to check we're not overflowing the space */322322+ __REQ_F_LAST_BIT,323323+};324324+325325+enum {326326+ /* ctx owns file */327327+ REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),328328+ /* drain existing IO first */329329+ REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),330330+ /* linked sqes */331331+ REQ_F_LINK = BIT(REQ_F_LINK_BIT),332332+ /* doesn't sever on completion < 0 */333333+ REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),334334+ /* IOSQE_ASYNC */335335+ REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),336336+ /* IOSQE_BUFFER_SELECT */337337+ REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),338338+ /* IOSQE_CQE_SKIP_SUCCESS */339339+ REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),340340+341341+ /* fail rest of links */342342+ REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),343343+ /* on inflight list, should be cancelled and waited on exit reliably */344344+ REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),345345+ /* read/write uses file position */346346+ REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),347347+ /* must not punt to workers */348348+ REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),349349+ /* has or had linked timeout */350350+ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),351351+ /* needs cleanup */352352+ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),353353+ /* already went through poll handler */354354+ REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),355355+ /* buffer already selected */356356+ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),357357+ /* buffer selected from ring, needs commit */358358+ REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),359359+ /* completion is deferred through io_comp_state */360360+ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),361361+ /* caller should reissue async */362362+ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),363363+ /* supports async reads/writes */364364+ REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),365365+ /* regular file */366366+ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),367367+ /* has creds assigned */368368+ REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),369369+ /* skip refcounting if not set */370370+ REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),371371+ /* there is a linked timeout that has to be armed */372372+ REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),373373+ /* ->async_data allocated */374374+ REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),375375+ /* don't post CQEs while failing linked requests */376376+ REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),377377+ /* single poll may be active */378378+ REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),379379+ /* double poll may active */380380+ REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),381381+ /* request has already done partial IO */382382+ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),383383+ /* fast poll multishot mode */384384+ REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),385385+ /* ->extra1 and ->extra2 are initialised */386386+ REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),387387+ /* recvmsg special flag, clear EPOLLIN */388388+ REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),389389+};390390+391391+typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);392392+393393+struct io_task_work {394394+ union {395395+ struct io_wq_work_node node;396396+ struct llist_node fallback_node;397397+ };398398+ io_req_tw_func_t func;399399+};400400+401401+struct io_cqe {402402+ __u64 user_data;403403+ __s32 res;404404+ /* fd initially, then cflags for completion */405405+ union {406406+ __u32 flags;407407+ int fd;408408+ };409409+};410410+411411+/*412412+ * Each request type overlays its private data structure on top of this one.413413+ * They must not exceed this one in size.414414+ */415415+struct io_cmd_data {416416+ struct file *file;417417+ /* each command gets 56 bytes of data */418418+ __u8 data[56];419419+};420420+421421+#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd)422422+#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr)423423+424424+struct io_kiocb {425425+ union {426426+ /*427427+ * NOTE! Each of the io_kiocb union members has the file pointer428428+ * as the first entry in their struct definition. So you can429429+ * access the file pointer through any of the sub-structs,430430+ * or directly as just 'file' in this struct.431431+ */432432+ struct file *file;433433+ struct io_cmd_data cmd;434434+ };435435+436436+ u8 opcode;437437+ /* polled IO has completed */438438+ u8 iopoll_completed;439439+ /*440440+ * Can be either a fixed buffer index, or used with provided buffers.441441+ * For the latter, before issue it points to the buffer group ID,442442+ * and after selection it points to the buffer ID itself.443443+ */444444+ u16 buf_index;445445+ unsigned int flags;446446+447447+ struct io_cqe cqe;448448+449449+ struct io_ring_ctx *ctx;450450+ struct task_struct *task;451451+452452+ struct io_rsrc_node *rsrc_node;453453+454454+ union {455455+ /* store used ubuf, so we can prevent reloading */456456+ struct io_mapped_ubuf *imu;457457+458458+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */459459+ struct io_buffer *kbuf;460460+461461+ /*462462+ * stores buffer ID for ring provided buffers, valid IFF463463+ * REQ_F_BUFFER_RING is set.464464+ */465465+ struct io_buffer_list *buf_list;466466+ };467467+468468+ union {469469+ /* used by request caches, completion batching and iopoll */470470+ struct io_wq_work_node comp_list;471471+ /* cache ->apoll->events */472472+ __poll_t apoll_events;473473+ };474474+ atomic_t refs;475475+ atomic_t poll_refs;476476+ struct io_task_work io_task_work;477477+ /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */478478+ union {479479+ struct hlist_node hash_node;480480+ struct {481481+ u64 extra1;482482+ u64 extra2;483483+ };484484+ };485485+ /* internal polling, see IORING_FEAT_FAST_POLL */486486+ struct async_poll *apoll;487487+ /* opcode allocated if it needs to store data for async defer */488488+ void *async_data;489489+ /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */490490+ struct io_kiocb *link;491491+ /* custom credentials, valid IFF REQ_F_CREDS is set */492492+ const struct cred *creds;493493+ struct io_wq_work work;494494+};495495+496496+#endif