···330330 external submit_close : t -> Unix.file_descr -> id -> bool = "ocaml_uring_submit_close" [@@noalloc]
331331 external submit_statx : t -> id -> Unix.file_descr -> Statx.t -> Sketch.ptr -> int -> int -> bool = "ocaml_uring_submit_statx_byte" "ocaml_uring_submit_statx_native" [@@noalloc]
332332 external submit_splice : t -> id -> Unix.file_descr -> Unix.file_descr -> int -> bool = "ocaml_uring_submit_splice" [@@noalloc]
333333+ external submit_bind : t -> id -> Unix.file_descr -> Sockaddr.t -> bool = "ocaml_uring_submit_bind" [@@noalloc]
334334+ external submit_listen : t -> id -> Unix.file_descr -> int -> bool = "ocaml_uring_submit_listen" [@@noalloc]
333335 external submit_connect : t -> id -> Unix.file_descr -> Sockaddr.t -> bool = "ocaml_uring_submit_connect" [@@noalloc]
334336 external submit_accept : t -> id -> Unix.file_descr -> Sockaddr.t -> bool = "ocaml_uring_submit_accept" [@@noalloc]
335337 external submit_cancel : t -> id -> id -> bool = "ocaml_uring_submit_cancel" [@@noalloc]
···545547546548let splice t ~src ~dst ~len user_data =
547549 with_id t (fun id -> Uring.submit_splice t.uring id src dst len) user_data
550550+551551+let bind t fd addr user_data =
552552+ let addr = Sockaddr.of_unix addr in
553553+ with_id_full t (fun id -> Uring.submit_bind t.uring id fd addr) user_data ~extra_data:addr
554554+555555+let listen t fd backlog user_data =
556556+ with_id t (fun id -> Uring.submit_listen t.uring id fd backlog) user_data
548557549558let connect t fd addr user_data =
550559 let addr = Sockaddr.of_unix addr in
+6
lib/uring/uring.mli
···595595(** [statx t ?fd ~mask path stat flags] stats [path], which is resolved relative to [fd]
596596 (or the current directory if [fd] is not given). *)
597597598598+val bind : 'a t -> Unix.file_descr -> Unix.sockaddr -> 'a -> 'a job option
599599+(** [bind t fd addr d] will submit a request to bind [fd] to [addr]. *)
600600+601601+val listen : 'a t -> Unix.file_descr -> int -> 'a -> 'a job option
602602+(** [listen t fd backlog d] will submit a request to listen on [fd] with [backlog] maximum pending connections. *)
603603+598604val connect : 'a t -> Unix.file_descr -> Unix.sockaddr -> 'a -> 'a job option
599605(** [connect t fd addr d] will submit a request to connect [fd] to [addr]. *)
600606
+24
lib/uring/uring_stubs.c
···778778779779// v_sockaddr must not be GC'd while the call is in progress
780780value /* noalloc */
781781+ocaml_uring_submit_bind(value v_uring, value v_id, value v_fd, value v_sockaddr) {
782782+ struct io_uring *ring = Ring_val(v_uring);
783783+ struct io_uring_sqe *sqe;
784784+ struct sock_addr_data *addr = Sock_addr_val(v_sockaddr);
785785+ sqe = io_uring_get_sqe(ring);
786786+ if (!sqe) return (Val_false);
787787+ io_uring_prep_bind(sqe, Int_val(v_fd), &(addr->sock_addr_addr.s_gen), addr->sock_addr_len);
788788+ io_uring_sqe_set_data(sqe, (void *)Long_val(v_id));
789789+ return (Val_true);
790790+}
791791+792792+value /* noalloc */
793793+ocaml_uring_submit_listen(value v_uring, value v_id, value v_fd, value v_backlog) {
794794+ struct io_uring *ring = Ring_val(v_uring);
795795+ struct io_uring_sqe *sqe;
796796+ sqe = io_uring_get_sqe(ring);
797797+ if (!sqe) return (Val_false);
798798+ io_uring_prep_listen(sqe, Int_val(v_fd), Int_val(v_backlog));
799799+ io_uring_sqe_set_data(sqe, (void *)Long_val(v_id));
800800+ return (Val_true);
801801+}
802802+803803+// v_sockaddr must not be GC'd while the call is in progress
804804+value /* noalloc */
781805ocaml_uring_submit_connect(value v_uring, value v_id, value v_fd, value v_sockaddr) {
782806 struct io_uring *ring = Ring_val(v_uring);
783807 struct io_uring_sqe *sqe;
···11+open Printf

(* End-to-end check of [Uring.bind] and [Uring.listen]: bind a loopback
   socket to an ephemeral port, listen on it, connect a client to it, then
   close both sockets through the ring. *)
let () =
  let queue_depth = 128 in
  let t = Uring.create ~queue_depth () in

  (* Reap [pending] close completions, reporting (but not failing on)
     close errors. *)
  let rec reap_closes pending =
    if pending > 0 then begin
      (match Uring.wait t with
       | Uring.None -> failwith "No completion for close"
       | Uring.Some { result; data = _ } ->
         if result < 0 then
           printf "Close warning: %s\n"
             (Unix.error_message (Uring.error_of_errno (-result))));
      reap_closes (pending - 1)
    end
  in

  (* Best-effort cleanup followed by [failwith msg].  The close completions
     are reaped before [Uring.exit] so that no request is still in flight
     when the ring is shut down; any cleanup failure is dropped so that
     [msg], the original error, is the one reported.
     NOTE(review): assumes [Uring.exit] rejects rings with requests still in
     progress, as its interface documentation states. *)
  let abort socks msg =
    let pending =
      List.fold_left
        (fun n sock ->
           match Uring.close t sock () with
           | Some _ -> n + 1
           | None -> n)
        0 socks
    in
    Uring.submit t |> ignore;
    (try
       reap_closes pending;
       Uring.exit t
     with Failure _ | Invalid_argument _ -> ());
    failwith msg
  in

  (* Submit an already-queued job and return the raw result of its
     completion.  [what] names the operation in error messages. *)
  let complete what job =
    match job with
    | None -> failwith (sprintf "Failed to submit %s operation" what)
    | Some _job ->
      Uring.submit t |> ignore;
      (match Uring.wait t with
       | Uring.None -> failwith (sprintf "No completion for %s" what)
       | Uring.Some { result; data = _ } -> result)
  in

  (* Human-readable message for a negative completion result. *)
  let errno_message result =
    Unix.error_message (Uring.error_of_errno (-result))
  in

  (* Local port of a bound IPv4 socket. *)
  let inet_port sock =
    match Unix.getsockname sock with
    | Unix.ADDR_INET (_, p) -> p
    | _ -> failwith "Unexpected address type"
  in

  (* The server socket is created with Unix.socket; this test only drives
     bind/listen/connect/close through the ring. *)
  let server_sock = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 0 in
  Unix.set_nonblock server_sock;
  printf "Server socket created\n";

  (* Port 0 lets the kernel choose a free port. *)
  let addr = Unix.ADDR_INET (Unix.inet_addr_loopback, 0) in

  (* Use io_uring for the bind operation *)
  let bind_result = complete "bind" (Uring.bind t server_sock addr ()) in
  if bind_result < 0 then
    abort [ server_sock ]
      (sprintf "Bind failed: %s" (errno_message bind_result));
  printf "Bind completed with result: %d\n" bind_result;

  (* Use io_uring for the listen operation *)
  let backlog = 10 in
  let listen_result =
    complete "listen" (Uring.listen t server_sock backlog ())
  in
  if listen_result < 0 then
    abort [ server_sock ]
      (sprintf "Listen failed: %s" (errno_message listen_result));
  printf "Listen completed with result: %d\n" listen_result;

  (* Find out which port the kernel picked. *)
  let port = inet_port server_sock in
  printf "Socket bound and listening on port: %d\n" port;

  (* Test connecting to the bound socket *)
  printf "Testing connection to the bound socket...\n";

  let client_sock = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 0 in
  Unix.set_nonblock client_sock;
  printf "Client socket created\n";

  (* Use io_uring for the connect operation *)
  let connect_addr = Unix.ADDR_INET (Unix.inet_addr_loopback, port) in
  let connect_result =
    complete "connect" (Uring.connect t client_sock connect_addr ())
  in
  (* A non-blocking connect may complete with EINPROGRESS, which is not a
     failure.  Compare symbolically rather than against a numeric errno,
     whose value differs between architectures. *)
  let in_progress =
    connect_result < 0
    && (match Uring.error_of_errno (-connect_result) with
        | Unix.EINPROGRESS -> true
        | _ -> false)
  in
  if connect_result < 0 && not in_progress then
    abort [ client_sock; server_sock ]
      (sprintf "Connect failed: %s (errno: %d)"
         (errno_message connect_result) (-connect_result));

  if connect_result = 0 || in_progress then
    printf "Connect initiated successfully (result: %d)\n" connect_result
  else
    printf "Connect completed with result: %d\n" connect_result;

  let client_port = inet_port client_sock in
  printf "Client socket connected from port: %d to port: %d\n" client_port port;

  (* Clean up using io_uring close operations *)
  List.iter
    (fun (name, sock) ->
       match Uring.close t sock () with
       | None -> failwith (sprintf "Failed to submit close for %s socket" name)
       | Some _ -> ())
    [ ("client", client_sock); ("server", server_sock) ];
  Uring.submit t |> ignore;

  (* Wait for both close operations to complete *)
  reap_closes 2;

  Uring.exit t;
  printf "Test completed successfully!\n"
···11+liburing-2.7 release
22+33+- Man page updates
44+- Sync with kernel 6.10
55+ - send/recv bundle support
66+ - accept nowait and CQE_F_MORE
77+- Add and update test cases
88+- Fix io_uring_queue_init_mem() returning a value that was too small,
99+ potentially causing memory corruption in userspace by overwriting
1010+ 64 bytes beyond the returned value. Also add test case for that.
1111+- Add 64-bit length variants of io_uring_prep_{m,f}advise()
1212+- Add BIND/LISTEN support and helpers / man pages
1313+- Add io_uring_enable_rings.3 man page
1414+- Fix bug in io_uring_prep_read_multishot()
1515+- Fixup bundle test cases
1616+- Add fixed-hugepage test case
1717+- Fix io_uring_prep_fixed_fd_install.3 man page
1818+- Note 'len' == 0 requirement in io_uring_prep_send.3 man page
1919+- Fix some test cases for skipping on older kernels
2020+2121+liburing-2.6 release
2222+2323+- Add getsockopt and setsockopt socket commands
2424+- Add test cases to test/hardlink
2525+- Man page fixes
2626+- Add futex support, and test cases
2727+- Add waitid support, and test cases
2828+- Add read multishot, and test cases
2929+- Add support for IORING_SETUP_NO_SQARRAY
3030+- Use IORING_SETUP_NO_SQARRAY as the default
3131+- Add support for IORING_OP_FIXED_FD_INSTALL
3232+- Add io_uring_prep_fixed_fd_install() helper
3333+- Support for napi busy polling
3434+- Improve/add test cases
3535+- Man page fixes
3636+- Add sample 'proxy' example
3737+3838+liburing-2.5 release
3939+4040+- Add support for io_uring_prep_cmd_sock()
4141+- Add support for application allocated ring memory, for placing rings
4242+ in huge mem. Available through io_uring_queue_init_mem().
4343+- Add support for registered ring fds
4444+- Various documentation updates
4545+- Various fixes
4646+147liburing-2.4 release
248349- Add io_uring_{major,minor,check}_version() functions.
···1561 io_uring_prep_socket_direct() factor in being called with
1662 IORING_FILE_INDEX_ALLOC for allocating a direct descriptor.
1763- Add io_uring_prep_sendto() function.
6464+- Add io_uring_prep_cmd_sock() function.
18651966liburing-2.3 release
2067
+15-4
vendor/liburing/Makefile
···1111 @$(MAKE) -C test
1212 @$(MAKE) -C examples
13131414-.PHONY: all install default clean test
1515-.PHONY: FORCE cscope
1414+library:
1515+ @$(MAKE) -C src
16161717-partcheck: all
1818- @echo "make partcheck => TODO add tests with out kernel support"
1717+.PHONY: all install default clean test library
1818+.PHONY: FORCE cscope
19192020runtests: all
2121 @$(MAKE) -C test runtests
···6060 $(INSTALL) -m 755 -d $(DESTDIR)$(mandir)/man7
6161 $(INSTALL) -m 644 man/*.7 $(DESTDIR)$(mandir)/man7
62626363+uninstall:
6464+ @$(MAKE) -C src uninstall prefix=$(DESTDIR)$(prefix) datadir=$(DESTDIR)$(datadir)
6565+ @rm -f $(DESTDIR)$(libdevdir)/pkgconfig/$(NAME).pc
6666+ @rm -f $(DESTDIR)$(libdevdir)/pkgconfig/$(NAME)-ffi.pc
6767+ @rm -rf $(DESTDIR)$(mandir)/man2/io_uring*.2
6868+ @rm -rf $(DESTDIR)$(mandir)/man3/io_uring*.3
6969+ @rm -rf $(DESTDIR)$(mandir)/man7/io_uring*.7
7070+6371install-tests:
6472 @$(MAKE) -C test install prefix=$(DESTDIR)$(prefix) datadir=$(DESTDIR)$(datadir)
7373+7474+uninstall-tests:
7575+ @$(MAKE) -C test uninstall prefix=$(DESTDIR)$(prefix) datadir=$(DESTDIR)$(datadir)
65766677clean:
6778 @rm -f config-host.mak config-host.h cscope.out $(NAME).pc $(NAME)-ffi.pc test/*.dmesg
+1-1
vendor/liburing/README
···5454 # Prepare build config (optional).
5555 #
5656 # --cc specifies the C compiler.
5757- # --cxx speficies the C++ compiler.
5757+ # --cxx specifies the C++ compiler.
5858 #
5959 ./configure --cc=gcc --cxx=g++;
6060
+97-30
vendor/liburing/configure
···55cc=${CC:-gcc}
66cxx=${CXX:-g++}
7788-#
99-# TODO(ammarfaizi2): Remove this notice and `--nolibc` option.
1010-#
1111-nolibc_deprecated() {
1212- echo "";
1313- echo "=================================================================";
1414- echo "";
1515- echo " --nolibc option is deprecated and has no effect.";
1616- echo " It will be removed in a future liburing release.";
1717- echo "";
1818- echo " liburing on x86-64, x86 (32-bit) and aarch64 always use CONFIG_NOLIBC.";
1919- echo "";
2020- echo "=================================================================";
2121- echo "";
2222-}
2323-248for opt do
259 optarg=$(expr "x$opt" : 'x[^=]*=\(.*\)' || true)
2610 case "$opt" in
2711 --help|-h) show_help=yes
2812 ;;
2929- --prefix=*) prefix="$optarg"
1313+ --prefix=*) prefix="$(realpath -s $optarg)"
3014 ;;
3115 --includedir=*) includedir="$optarg"
3216 ;;
···4226 ;;
4327 --cxx=*) cxx="$optarg"
4428 ;;
4545- --nolibc) nolibc_deprecated
2929+ --use-libc) use_libc=yes
4630 ;;
4731 *)
4832 echo "ERROR: unknown option $opt"
···9175 --datadir=PATH install shared data in PATH [$datadir]
9276 --cc=CMD use CMD as the C compiler
9377 --cxx=CMD use CMD as the C++ compiler
9494- --nolibc build liburing without libc
7878+ --use-libc use libc for liburing (useful for hardening)
9579EOF
9680exit 0
9781fi
···218202print_and_output_mak "mandir" "$mandir"
219203print_and_output_mak "datadir" "$datadir"
220204205205+####################################################
206206+# Check for correct compiler runtime library to link with
207207+libgcc_link_flag="-lgcc"
208208+if $cc -print-libgcc-file-name >/dev/null 2>&1; then
209209+ libgcc_link_flag="$($cc $CFLAGS $LDFLAGS -print-libgcc-file-name)"
210210+fi
211211+print_and_output_mak "libgcc_link_flag" "$libgcc_link_flag"
212212+####################################################
213213+221214##########################################
222215# check for compiler -Wstringop-overflow
223216stringop_overflow="no"
···400393fi
401394print_config "NVMe uring command support" "$nvme_uring_cmd"
402395396396+##########################################
397397+# Check futexv support
398398+futexv="no"
399399+cat > $TMPC << EOF
400400+#include <linux/futex.h>
401401+#include <unistd.h>
402402+#include <string.h>
403403+int main(void)
404404+{
405405+ struct futex_waitv fw;
406406+407407+ memset(&fw, FUTEX_32, sizeof(fw));
408408+409409+ return sizeof(struct futex_waitv);
410410+}
411411+EOF
412412+if compile_prog "" "" "futexv"; then
413413+ futexv="yes"
414414+fi
415415+print_config "futex waitv support" "$futexv"
416416+417417+##########################################
418418+# Check idtype_t support
419419+has_idtype_t="no"
420420+cat > $TMPC << EOF
421421+#include <sys/wait.h>
422422+int main(void)
423423+{
424424+ idtype_t v;
425425+ return 0;
426426+}
427427+EOF
428428+if compile_prog "" "" "idtype_t"; then
429429+ has_idtype_t="yes"
430430+fi
431431+print_config "has_idtype_t" "$has_idtype_t"
432432+403433#############################################################################
404404-#
405405-# Currently, CONFIG_NOLIBC is only enabled on x86-64, x86 (32-bit) and aarch64.
406406-#
407407-cat > $TMPC << EOF
434434+liburing_nolibc="no"
435435+if test "$use_libc" != "yes"; then
436436+437437+ #
438438+ # Currently, CONFIG_NOLIBC only supports x86-64, x86 (32-bit), aarch64 and riscv64.
439439+ #
440440+ cat > $TMPC << EOF
408441int main(void){
409409-#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
442442+#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
410443 return 0;
411444#else
412445#error libc is needed
413446#endif
414447}
415448EOF
416416-if compile_prog "" "" "nolibc support"; then
417417- liburing_nolibc="yes"
449449+450450+ if compile_prog "" "" "nolibc"; then
451451+ liburing_nolibc="yes"
452452+ fi
418453fi
419419-print_config "nolibc support" "$liburing_nolibc";
454454+455455+print_config "nolibc" "$liburing_nolibc";
420456#############################################################################
421457422458####################################################
···471507if test "$has_fanotify" = "yes"; then
472508 output_sym "CONFIG_HAVE_FANOTIFY"
473509fi
510510+if test "$futexv" = "yes"; then
511511+ output_sym "CONFIG_HAVE_FUTEXV"
512512+fi
474513475514echo "CC=$cc" >> $config_host_mak
476515print_config "CC" "$cc"
···478517print_config "CXX" "$cxx"
479518480519# generate io_uring_version.h
481481-MAKE_PRINT_VARS="include Makefile.common\nprint-%: ; @echo \$(\$*)\n"
482482-VERSION_MAJOR=$(env echo -e "$MAKE_PRINT_VARS" | make -s --no-print-directory -f - print-VERSION_MAJOR)
483483-VERSION_MINOR=$(env echo -e "$MAKE_PRINT_VARS" | make -s --no-print-directory -f - print-VERSION_MINOR)
520520+# Reset MAKEFLAGS
521521+MAKEFLAGS=
522522+MAKE_PRINT_VARS="include Makefile.common\nprint-%%: ; @echo \$(\$*)\n"
523523+VERSION_MAJOR=$(printf "$MAKE_PRINT_VARS" | make -s --no-print-directory -f - print-VERSION_MAJOR)
524524+VERSION_MINOR=$(printf "$MAKE_PRINT_VARS" | make -s --no-print-directory -f - print-VERSION_MINOR)
484525io_uring_version_h="src/include/liburing/io_uring_version.h"
485526cat > $io_uring_version_h << EOF
486527/* SPDX-License-Identifier: MIT */
···551592552593EOF
553594fi
595595+if test "$futexv" != "yes"; then
596596+cat >> $compat_h << EOF
597597+#include <inttypes.h>
554598599599+#define FUTEX_32 2
600600+#define FUTEX_WAITV_MAX 128
601601+602602+struct futex_waitv {
603603+ uint64_t val;
604604+ uint64_t uaddr;
605605+ uint32_t flags;
606606+ uint32_t __reserved;
607607+};
608608+609609+EOF
610610+fi
611611+612612+if test "$has_idtype_t" != "yes"; then
613613+cat >> $compat_h << EOF
614614+typedef enum
615615+{
616616+ P_ALL, /* Wait for any child. */
617617+ P_PID, /* Wait for specified process. */
618618+ P_PGID /* Wait for members of process group. */
619619+} idtype_t;
620620+EOF
621621+fi
555622cat >> $compat_h << EOF
556623#endif
557624EOF
···11+/* SPDX-License-Identifier: MIT */
22+/*
33+ * Sample program that can act either as a packet sink, where it just receives
44+ * packets and doesn't do anything with them, or it can act as a proxy where it
55+ * receives packets and then sends them to a new destination. The proxy can
66+ * be unidirectional (-B0), or bi-directional (-B1).
77+ *
88+ * Examples:
99+ *
1010+ * Act as a proxy, listening on port 4444, and send data to 192.168.2.6 on port
1111+ * 4445. Use multishot receive, DEFER_TASKRUN, and fixed files
1212+ *
1313+ * ./proxy -m1 -r4444 -H 192.168.2.6 -p4445
1414+ *
1515+ * Same as above, but utilize send bundles (-C1, requires -u1 send_ring) as well
1616+ * with ring provided send buffers, and recv bundles (-c1).
1717+ *
1818+ * ./proxy -m1 -c1 -u1 -C1 -r4444 -H 192.168.2.6 -p4445
1919+ *
2020+ * Act as a bi-directional proxy, listening on port 8888, and send data back
2121+ * and forth between host and 192.168.2.6 on port 22. Use multishot receive,
2222+ * DEFER_TASKRUN, fixed files, and buffers of size 1500.
2323+ *
2424+ * ./proxy -m1 -B1 -b1500 -r8888 -H 192.168.2.6 -p22
2525+ *
2626+ * Act as a sink, listening on port 4445, using multishot receive, DEFER_TASKRUN,
2727+ * and fixed files:
2828+ *
2929+ * ./proxy -m1 -s1 -r4445
3030+ *
3131+ * Run with -h to see a list of options, and their defaults.
3232+ *
3333+ * (C) 2024 Jens Axboe <axboe@kernel.dk>
3434+ *
3535+ */
3636+#include <fcntl.h>
3737+#include <stdint.h>
3838+#include <netinet/in.h>
3939+#include <netinet/tcp.h>
4040+#include <arpa/inet.h>
4141+#include <stdio.h>
4242+#include <stdlib.h>
4343+#include <string.h>
4444+#include <sys/socket.h>
4545+#include <sys/time.h>
4646+#include <unistd.h>
4747+#include <sys/mman.h>
4848+#include <linux/mman.h>
4949+#include <locale.h>
5050+#include <assert.h>
5151+#include <pthread.h>
5252+#include <liburing.h>
5353+5454+#include "proxy.h"
5555+#include "helpers.h"
5656+5757+/*
5858+ * Will go away once/if bundles are upstreamed and we put the generic
5959+ * definitions in the kernel header.
6060+ */
6161+#ifndef IORING_RECVSEND_BUNDLE
6262+#define IORING_RECVSEND_BUNDLE (1U << 4)
6363+#endif
6464+#ifndef IORING_FEAT_SEND_BUF_SELECT
6565+#define IORING_FEAT_SEND_BUF_SELECT (1U << 14)
6666+#endif
6767+6868+static int cur_bgid = 1;
6969+static int nr_conns;
7070+static int open_conns;
7171+static long page_size;
7272+7373+static unsigned long event_loops;
7474+static unsigned long events;
7575+7676+static int recv_mshot = 1;
7777+static int sqpoll;
7878+static int defer_tw = 1;
7979+static int is_sink;
8080+static int fixed_files = 1;
8181+static char *host = "192.168.3.2";
8282+static int send_port = 4445;
8383+static int receive_port = 4444;
8484+static int buf_size = 32;
8585+static int bidi;
8686+static int ipv6;
8787+static int napi;
8888+static int napi_timeout;
8989+static int wait_batch = 1;
9090+static int wait_usec = 1000000;
9191+static int rcv_msg;
9292+static int snd_msg;
9393+static int snd_zc;
9494+static int send_ring = -1;
9595+static int snd_bundle;
9696+static int rcv_bundle;
9797+static int use_huge;
9898+static int ext_stat;
9999+static int verbose;
100100+101101+static int nr_bufs = 256;
102102+static int br_mask;
103103+104104+static int ring_size = 128;
105105+106106+static pthread_mutex_t thread_lock;
107107+static struct timeval last_housekeeping;
108108+109109+/*
110110+ * For sendmsg/recvmsg. recvmsg just has a single vec, sendmsg will have
111111+ * two vecs - one that is currently submitted and being sent, and one that
112112+ * is being prepared. When a new sendmsg is issued, we'll swap which one we
113113+ * use. For send, even though we don't pass in the iovec itself, we use the
114114+ * vec to serialize the sends to avoid reordering.
115115+ */
116116+struct msg_vec {
117117+ struct iovec *iov;
118118+ /* length of allocated vec */
119119+ int vec_size;
120120+ /* length currently being used */
121121+ int iov_len;
122122+ /* only for send, current index we're processing */
123123+ int cur_iov;
124124+};
125125+126126+struct io_msg {
127127+ struct msghdr msg;
128128+ struct msg_vec vecs[2];
129129+ /* current msg_vec being prepared */
130130+ int vec_index;
131131+};
132132+133133+/*
134134+ * Per socket stats per connection. For bi-directional, we'll have both
135135+ * sends and receives on each socket, this helps track them separately.
136136+ * For sink or one directional, each of the two stats will be only sends
137137+ * or receives, not both.
138138+ */
139139+struct conn_dir {
140140+ int index;
141141+142142+ int pending_shutdown;
143143+ int pending_send;
144144+ int pending_recv;
145145+146146+ int snd_notif;
147147+148148+ int out_buffers;
149149+150150+ int rcv, rcv_shrt, rcv_enobufs, rcv_mshot;
151151+ int snd, snd_shrt, snd_enobufs, snd_busy, snd_mshot;
152152+153153+ int snd_next_bid;
154154+ int rcv_next_bid;
155155+156156+ int *rcv_bucket;
157157+ int *snd_bucket;
158158+159159+ unsigned long in_bytes, out_bytes;
160160+161161+ /* only ever have a single recv pending */
162162+ struct io_msg io_rcv_msg;
163163+164164+ /* one send that is inflight, and one being prepared for the next one */
165165+ struct io_msg io_snd_msg;
166166+};
167167+168168+enum {
169169+ CONN_F_STARTED = 1,
170170+ CONN_F_DISCONNECTING = 2,
171171+ CONN_F_DISCONNECTED = 4,
172172+ CONN_F_PENDING_SHUTDOWN = 8,
173173+ CONN_F_STATS_SHOWN = 16,
174174+ CONN_F_END_TIME = 32,
175175+ CONN_F_REAPED = 64,
176176+};
177177+178178+/*
179179+ * buffer ring belonging to a connection
180180+ */
181181+struct conn_buf_ring {
182182+ struct io_uring_buf_ring *br;
183183+ void *buf;
184184+ int bgid;
185185+};
186186+187187+struct conn {
188188+ struct io_uring ring;
189189+190190+ /* receive side buffer ring, new data arrives here */
191191+ struct conn_buf_ring in_br;
192192+ /* if send_ring is used, outgoing data to send */
193193+ struct conn_buf_ring out_br;
194194+195195+ int tid;
196196+ int in_fd, out_fd;
197197+ int pending_cancels;
198198+ int flags;
199199+200200+ struct conn_dir cd[2];
201201+202202+ struct timeval start_time, end_time;
203203+204204+ union {
205205+ struct sockaddr_in addr;
206206+ struct sockaddr_in6 addr6;
207207+ };
208208+209209+ pthread_t thread;
210210+ pthread_barrier_t startup_barrier;
211211+};
212212+213213+#define MAX_CONNS 1024
214214+static struct conn conns[MAX_CONNS];
215215+216216+#define vlog(str, ...) do { \
217217+ if (verbose) \
218218+ printf(str, ##__VA_ARGS__); \
219219+} while (0)
220220+221221+static int prep_next_send(struct io_uring *ring, struct conn *c,
222222+ struct conn_dir *cd, int fd);
223223+static void *thread_main(void *data);
224224+225225+static struct conn *cqe_to_conn(struct io_uring_cqe *cqe)
226226+{
227227+ struct userdata ud = { .val = cqe->user_data };
228228+229229+ return &conns[ud.op_tid & TID_MASK];
230230+}
231231+232232+static struct conn_dir *cqe_to_conn_dir(struct conn *c,
233233+ struct io_uring_cqe *cqe)
234234+{
235235+ int fd = cqe_to_fd(cqe);
236236+237237+ return &c->cd[fd != c->in_fd];
238238+}
239239+240240+static int other_dir_fd(struct conn *c, int fd)
241241+{
242242+ if (c->in_fd == fd)
243243+ return c->out_fd;
244244+ return c->in_fd;
245245+}
246246+247247+/* currently active msg_vec */
248248+static struct msg_vec *msg_vec(struct io_msg *imsg)
249249+{
250250+ return &imsg->vecs[imsg->vec_index];
251251+}
252252+253253+static struct msg_vec *snd_msg_vec(struct conn_dir *cd)
254254+{
255255+ return msg_vec(&cd->io_snd_msg);
256256+}
257257+258258+/*
259259+ * Goes from accept new connection -> create socket, connect to end
260260+ * point, prepare recv, on receive do send (unless sink). If either ends
261261+ * disconnects, we transition to shutdown and then close.
262262+ */
263263+enum {
264264+ __ACCEPT = 1,
265265+ __SOCK = 2,
266266+ __CONNECT = 3,
267267+ __RECV = 4,
268268+ __RECVMSG = 5,
269269+ __SEND = 6,
270270+ __SENDMSG = 7,
271271+ __SHUTDOWN = 8,
272272+ __CANCEL = 9,
273273+ __CLOSE = 10,
274274+ __FD_PASS = 11,
275275+ __NOP = 12,
276276+ __STOP = 13,
277277+};
278278+279279+struct error_handler {
280280+ const char *name;
281281+ int (*error_fn)(struct error_handler *, struct io_uring *, struct io_uring_cqe *);
282282+};
283283+284284+static int recv_error(struct error_handler *err, struct io_uring *ring,
285285+ struct io_uring_cqe *cqe);
286286+static int send_error(struct error_handler *err, struct io_uring *ring,
287287+ struct io_uring_cqe *cqe);
288288+289289+static int default_error(struct error_handler *err,
290290+ struct io_uring __attribute__((__unused__)) *ring,
291291+ struct io_uring_cqe *cqe)
292292+{
293293+ struct conn *c = cqe_to_conn(cqe);
294294+295295+ fprintf(stderr, "%d: %s error %s\n", c->tid, err->name, strerror(-cqe->res));
296296+ fprintf(stderr, "fd=%d, bid=%d\n", cqe_to_fd(cqe), cqe_to_bid(cqe));
297297+ return 1;
298298+}
299299+300300+/*
301301+ * Move error handling out of the normal handling path, cleanly separating
302302+ * them. If an opcode doesn't need any error handling, set it to NULL. If
303303+ * it wants to stop the connection at that point and not do anything else,
304304+ * then the default handler can be used. Only receive has proper error
305305+ * handling, as we can get -ENOBUFS which is not a fatal condition. It just
306306+ * means we need to wait on buffer replenishing before re-arming the receive.
307307+ */
308308+static struct error_handler error_handlers[] = {
309309+ { .name = "NULL", .error_fn = NULL, },
310310+ { .name = "ACCEPT", .error_fn = default_error, },
311311+ { .name = "SOCK", .error_fn = default_error, },
312312+ { .name = "CONNECT", .error_fn = default_error, },
313313+ { .name = "RECV", .error_fn = recv_error, },
314314+ { .name = "RECVMSG", .error_fn = recv_error, },
315315+ { .name = "SEND", .error_fn = send_error, },
316316+ { .name = "SENDMSG", .error_fn = send_error, },
317317+ { .name = "SHUTDOWN", .error_fn = NULL, },
318318+ { .name = "CANCEL", .error_fn = NULL, },
319319+ { .name = "CLOSE", .error_fn = NULL, },
320320+ { .name = "FD_PASS", .error_fn = default_error, },
321321+ { .name = "NOP", .error_fn = NULL, },
322322+ { .name = "STOP", .error_fn = default_error, },
323323+};
324324+325325+static void free_buffer_ring(struct io_uring *ring, struct conn_buf_ring *cbr)
326326+{
327327+ if (!cbr->br)
328328+ return;
329329+330330+ io_uring_free_buf_ring(ring, cbr->br, nr_bufs, cbr->bgid);
331331+ cbr->br = NULL;
332332+ if (use_huge)
333333+ munmap(cbr->buf, buf_size * nr_bufs);
334334+ else
335335+ free(cbr->buf);
336336+}
337337+338338+static void free_buffer_rings(struct io_uring *ring, struct conn *c)
339339+{
340340+ free_buffer_ring(ring, &c->in_br);
341341+ free_buffer_ring(ring, &c->out_br);
342342+}
343343+344344+/*
345345+ * Setup a ring provided buffer ring for each connection. If we get -ENOBUFS
346346+ * on receive, for multishot receive we'll wait for half the provided buffers
347347+ * to be returned by pending sends, then re-arm the multishot receive. If
348348+ * this happens too frequently (see enobufs= stat), then the ring size is
349349+ * likely too small. Use -nXX to make it bigger. See recv_enobufs().
350350+ *
351351+ * The alternative here would be to use the older style provided buffers,
352352+ * where you simply setup a buffer group and use SQEs with
353353+ * io_uring_prep_provide_buffers() to add to the pool. But that approach is
354354+ * slower and has been deprecated by using the faster ring provided buffers.
355355+ */
356356+static int setup_recv_ring(struct io_uring *ring, struct conn *c)
357357+{
358358+ struct conn_buf_ring *cbr = &c->in_br;
359359+ int ret, i;
360360+ size_t len;
361361+ void *ptr;
362362+363363+ len = buf_size * nr_bufs;
364364+ if (use_huge) {
365365+ cbr->buf = mmap(NULL, len, PROT_READ|PROT_WRITE,
366366+ MAP_PRIVATE|MAP_HUGETLB|MAP_HUGE_2MB|MAP_ANONYMOUS,
367367+ -1, 0);
368368+ if (cbr->buf == MAP_FAILED) {
369369+ perror("mmap");
370370+ return 1;
371371+ }
372372+ } else {
373373+ if (posix_memalign(&cbr->buf, page_size, len)) {
374374+ perror("posix memalign");
375375+ return 1;
376376+ }
377377+ }
378378+ cbr->br = io_uring_setup_buf_ring(ring, nr_bufs, cbr->bgid, 0, &ret);
379379+ if (!cbr->br) {
380380+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
381381+ return 1;
382382+ }
383383+384384+ ptr = cbr->buf;
385385+ for (i = 0; i < nr_bufs; i++) {
386386+ vlog("%d: add bid %d, data %p\n", c->tid, i, ptr);
387387+ io_uring_buf_ring_add(cbr->br, ptr, buf_size, i, br_mask, i);
388388+ ptr += buf_size;
389389+ }
390390+ io_uring_buf_ring_advance(cbr->br, nr_bufs);
391391+ printf("%d: recv buffer ring bgid %d, bufs %d\n", c->tid, cbr->bgid, nr_bufs);
392392+ return 0;
393393+}
394394+395395+/*
396396+ * If 'send_ring' is used and the kernel supports it, we can skip serializing
397397+ * sends as the data will be ordered regardless. This reduces the send handling
398398+ * complexity, as buffers can always be added to the outgoing ring and will be
399399+ * processed in the order in which they were added.
400400+ */
401401+static int setup_send_ring(struct io_uring *ring, struct conn *c)
402402+{
403403+ struct conn_buf_ring *cbr = &c->out_br;
404404+ int ret;
405405+406406+ cbr->br = io_uring_setup_buf_ring(ring, nr_bufs, cbr->bgid, 0, &ret);
407407+ if (!cbr->br) {
408408+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
409409+ return 1;
410410+ }
411411+412412+ printf("%d: send buffer ring bgid %d, bufs %d\n", c->tid, cbr->bgid, nr_bufs);
413413+ return 0;
414414+}
415415+416416+static int setup_send_zc(struct io_uring *ring, struct conn *c)
417417+{
418418+ struct iovec *iovs;
419419+ void *buf;
420420+ int i, ret;
421421+422422+ if (snd_msg)
423423+ return 0;
424424+425425+ buf = c->in_br.buf;
426426+ iovs = calloc(nr_bufs, sizeof(struct iovec));
427427+ for (i = 0; i < nr_bufs; i++) {
428428+ iovs[i].iov_base = buf;
429429+ iovs[i].iov_len = buf_size;
430430+ buf += buf_size;
431431+ }
432432+433433+ ret = io_uring_register_buffers(ring, iovs, nr_bufs);
434434+ if (ret) {
435435+ fprintf(stderr, "failed registering buffers: %d\n", ret);
436436+ free(iovs);
437437+ return ret;
438438+ }
439439+ free(iovs);
440440+ return 0;
441441+}
442442+443443+/*
444444+ * Setup an input and output buffer ring.
445445+ */
446446+static int setup_buffer_rings(struct io_uring *ring, struct conn *c)
447447+{
448448+ int ret;
449449+450450+ /* no locking needed on cur_bgid, parent serializes setup */
451451+ c->in_br.bgid = cur_bgid++;
452452+ c->out_br.bgid = cur_bgid++;
453453+ c->out_br.br = NULL;
454454+455455+ ret = setup_recv_ring(ring, c);
456456+ if (ret)
457457+ return ret;
458458+ if (is_sink)
459459+ return 0;
460460+ if (snd_zc) {
461461+ ret = setup_send_zc(ring, c);
462462+ if (ret)
463463+ return ret;
464464+ }
465465+ if (send_ring) {
466466+ ret = setup_send_ring(ring, c);
467467+ if (ret) {
468468+ free_buffer_ring(ring, &c->in_br);
469469+ return ret;
470470+ }
471471+ }
472472+473473+ return 0;
474474+}
/*
 * One histogram entry: 'count' is how often the bucket for 'nr_packets'
 * packets was hit.
 */
struct bucket_stat {
	int nr_packets;
	int count;
};

/*
 * qsort() comparator for struct bucket_stat, ordering entries by descending
 * 'count': returns 1 if p1 has the smaller count, -1 if it has the larger
 * one, and 0 if both are equal.
 */
static int stat_cmp(const void *p1, const void *p2)
{
	const struct bucket_stat *lhs = p1, *rhs = p2;

	return (lhs->count < rhs->count) - (lhs->count > rhs->count);
}
492492+493493+static void show_buckets(struct conn_dir *cd)
494494+{
495495+ unsigned long snd_total, rcv_total;
496496+ struct bucket_stat *rstat, *sstat;
497497+ int i;
498498+499499+ if (!cd->rcv_bucket || !cd->snd_bucket)
500500+ return;
501501+502502+ rstat = calloc(nr_bufs + 1, sizeof(struct bucket_stat));
503503+ sstat = calloc(nr_bufs + 1, sizeof(struct bucket_stat));
504504+505505+ snd_total = rcv_total = 0;
506506+ for (i = 0; i <= nr_bufs; i++) {
507507+ snd_total += cd->snd_bucket[i];
508508+ sstat[i].nr_packets = i;
509509+ sstat[i].count = cd->snd_bucket[i];
510510+ rcv_total += cd->rcv_bucket[i];
511511+ rstat[i].nr_packets = i;
512512+ rstat[i].count = cd->rcv_bucket[i];
513513+ }
514514+515515+ if (!snd_total && !rcv_total) {
516516+ free(sstat);
517517+ free(rstat);
518518+ }
519519+ if (snd_total)
520520+ qsort(sstat, nr_bufs, sizeof(struct bucket_stat), stat_cmp);
521521+ if (rcv_total)
522522+ qsort(rstat, nr_bufs, sizeof(struct bucket_stat), stat_cmp);
523523+524524+ printf("\t Packets per recv/send:\n");
525525+ for (i = 0; i <= nr_bufs; i++) {
526526+ double snd_prc = 0.0, rcv_prc = 0.0;
527527+ if (!rstat[i].count && !sstat[i].count)
528528+ continue;
529529+ if (rstat[i].count)
530530+ rcv_prc = 100.0 * (rstat[i].count / (double) rcv_total);
531531+ if (sstat[i].count)
532532+ snd_prc = 100.0 * (sstat[i].count / (double) snd_total);
533533+ printf("\t bucket(%3d/%3d): rcv=%u (%.2f%%) snd=%u (%.2f%%)\n",
534534+ rstat[i].nr_packets, sstat[i].nr_packets,
535535+ rstat[i].count, rcv_prc,
536536+ sstat[i].count, snd_prc);
537537+ }
538538+539539+ free(sstat);
540540+ free(rstat);
541541+}
542542+543543+static void __show_stats(struct conn *c)
544544+{
545545+ unsigned long msec, qps;
546546+ unsigned long bytes, bw;
547547+ struct conn_dir *cd;
548548+ int i;
549549+550550+ if (c->flags & (CONN_F_STATS_SHOWN | CONN_F_REAPED))
551551+ return;
552552+ if (!(c->flags & CONN_F_STARTED))
553553+ return;
554554+555555+ if (!(c->flags & CONN_F_END_TIME))
556556+ gettimeofday(&c->end_time, NULL);
557557+558558+ msec = (c->end_time.tv_sec - c->start_time.tv_sec) * 1000;
559559+ msec += (c->end_time.tv_usec - c->start_time.tv_usec) / 1000;
560560+561561+ qps = 0;
562562+ for (i = 0; i < 2; i++)
563563+ qps += c->cd[i].rcv + c->cd[i].snd;
564564+565565+ if (!qps)
566566+ return;
567567+568568+ if (msec)
569569+ qps = (qps * 1000) / msec;
570570+571571+ printf("Conn %d/(in_fd=%d, out_fd=%d): qps=%lu, msec=%lu\n", c->tid,
572572+ c->in_fd, c->out_fd, qps, msec);
573573+574574+ bytes = 0;
575575+ for (i = 0; i < 2; i++) {
576576+ cd = &c->cd[i];
577577+578578+ if (!cd->in_bytes && !cd->out_bytes && !cd->snd && !cd->rcv)
579579+ continue;
580580+581581+ bytes += cd->in_bytes;
582582+ bytes += cd->out_bytes;
583583+584584+ printf("\t%3d: rcv=%u (short=%u, enobufs=%d), snd=%u (short=%u,"
585585+ " busy=%u, enobufs=%d)\n", i, cd->rcv, cd->rcv_shrt,
586586+ cd->rcv_enobufs, cd->snd, cd->snd_shrt, cd->snd_busy,
587587+ cd->snd_enobufs);
588588+ printf("\t : in_bytes=%lu (Kb %lu), out_bytes=%lu (Kb %lu)\n",
589589+ cd->in_bytes, cd->in_bytes >> 10,
590590+ cd->out_bytes, cd->out_bytes >> 10);
591591+ printf("\t : mshot_rcv=%d, mshot_snd=%d\n", cd->rcv_mshot,
592592+ cd->snd_mshot);
593593+ show_buckets(cd);
594594+595595+ }
596596+ if (msec) {
597597+ bytes *= 8UL;
598598+ bw = bytes / 1000;
599599+ bw /= msec;
600600+ printf("\tBW=%'luMbit\n", bw);
601601+ }
602602+603603+ c->flags |= CONN_F_STATS_SHOWN;
604604+}
605605+606606+static void show_stats(void)
607607+{
608608+ float events_per_loop = 0.0;
609609+ static int stats_shown;
610610+ int i;
611611+612612+ if (stats_shown)
613613+ return;
614614+615615+ if (events)
616616+ events_per_loop = (float) events / (float) event_loops;
617617+618618+ printf("Event loops: %lu, events %lu, events per loop %.2f\n", event_loops,
619619+ events, events_per_loop);
620620+621621+ for (i = 0; i < MAX_CONNS; i++) {
622622+ struct conn *c = &conns[i];
623623+624624+ __show_stats(c);
625625+ }
626626+ stats_shown = 1;
627627+}
/*
 * Signal handler (going by its name, for SIGINT; the registration is not
 * part of this section): start a fresh line, dump the statistics gathered
 * so far and exit with status 1. The signal number itself is not used.
 */
static void sig_int(int __attribute__((__unused__)) sig)
{
	fputs("\n", stdout);
	show_stats();
	exit(1);
}
635635+636636+/*
637637+ * Special cased for SQPOLL only, as we don't control when SQEs are consumed if
638638+ * that is used. Hence we may need to wait for the SQPOLL thread to keep up
639639+ * until we can get a new SQE. All other cases will break immediately, with a
640640+ * fresh SQE.
641641+ *
642642+ * If we grossly undersized our SQ ring, getting a NULL sqe can happen even
643643+ * for the !SQPOLL case if we're handling a lot of CQEs in our event loop
644644+ * and multishot isn't used. We can do io_uring_submit() to flush what we
645645+ * have here. Only caveat here is that if linked requests are used, SQEs
646646+ * would need to be allocated upfront as a link chain is only valid within
647647+ * a single submission cycle.
648648+ */
649649+static struct io_uring_sqe *get_sqe(struct io_uring *ring)
650650+{
651651+ struct io_uring_sqe *sqe;
652652+653653+ do {
654654+ sqe = io_uring_get_sqe(ring);
655655+ if (sqe)
656656+ break;
657657+ if (!sqpoll)
658658+ io_uring_submit(ring);
659659+ else
660660+ io_uring_sqring_wait(ring);
661661+ } while (1);
662662+663663+ return sqe;
664664+}
665665+666666+/*
667667+ * See __encode_userdata() for how we encode sqe->user_data, which is passed
668668+ * back as cqe->user_data at completion time.
669669+ */
670670+static void encode_userdata(struct io_uring_sqe *sqe, struct conn *c, int op,
671671+ int bid, int fd)
672672+{
673673+ __encode_userdata(sqe, c->tid, op, bid, fd);
674674+}
675675+676676+static void __submit_receive(struct io_uring *ring, struct conn *c,
677677+ struct conn_dir *cd, int fd)
678678+{
679679+ struct conn_buf_ring *cbr = &c->in_br;
680680+ struct io_uring_sqe *sqe;
681681+682682+ vlog("%d: submit receive fd=%d\n", c->tid, fd);
683683+684684+ assert(!cd->pending_recv);
685685+ cd->pending_recv = 1;
686686+687687+ /*
688688+ * For both recv and multishot receive, we use the ring provided
689689+ * buffers. These are handed to the application ahead of time, and
690690+ * are consumed when a receive triggers. Note that the address and
691691+ * length of the receive are set to NULL/0, and we assign the
692692+ * sqe->buf_group to tell the kernel which buffer group ID to pick
693693+ * a buffer from. Finally, IOSQE_BUFFER_SELECT is set to tell the
694694+ * kernel that we want a buffer picked for this request, we are not
695695+ * passing one in with the request.
696696+ */
697697+ sqe = get_sqe(ring);
698698+ if (rcv_msg) {
699699+ struct io_msg *imsg = &cd->io_rcv_msg;
700700+ struct msghdr *msg = &imsg->msg;
701701+702702+ memset(msg, 0, sizeof(*msg));
703703+ msg->msg_iov = msg_vec(imsg)->iov;
704704+ msg->msg_iovlen = msg_vec(imsg)->iov_len;
705705+706706+ if (recv_mshot) {
707707+ cd->rcv_mshot++;
708708+ io_uring_prep_recvmsg_multishot(sqe, fd, &imsg->msg, 0);
709709+ } else {
710710+ io_uring_prep_recvmsg(sqe, fd, &imsg->msg, 0);
711711+ }
712712+ } else {
713713+ if (recv_mshot) {
714714+ cd->rcv_mshot++;
715715+ io_uring_prep_recv_multishot(sqe, fd, NULL, 0, 0);
716716+ } else {
717717+ io_uring_prep_recv(sqe, fd, NULL, 0, 0);
718718+ }
719719+ }
720720+ encode_userdata(sqe, c, __RECV, 0, fd);
721721+ sqe->buf_group = cbr->bgid;
722722+ sqe->flags |= IOSQE_BUFFER_SELECT;
723723+ if (fixed_files)
724724+ sqe->flags |= IOSQE_FIXED_FILE;
725725+ if (rcv_bundle)
726726+ sqe->ioprio |= IORING_RECVSEND_BUNDLE;
727727+}
728728+729729+/*
730730+ * One directional just arms receive on our in_fd
731731+ */
732732+static void submit_receive(struct io_uring *ring, struct conn *c)
733733+{
734734+ __submit_receive(ring, c, &c->cd[0], c->in_fd);
735735+}
736736+737737+/*
738738+ * Bi-directional arms receive on both in and out fd
739739+ */
740740+static void submit_bidi_receive(struct io_uring *ring, struct conn *c)
741741+{
742742+ __submit_receive(ring, c, &c->cd[0], c->in_fd);
743743+ __submit_receive(ring, c, &c->cd[1], c->out_fd);
744744+}
745745+746746+/*
747747+ * We hit -ENOBUFS, which means that we ran out of buffers in our current
748748+ * provided buffer group. This can happen if there's an imbalance between the
749749+ * receives coming in and the sends being processed, particularly with multishot
750750+ * receive as they can trigger very quickly. If this happens, defer arming a
751751+ * new receive until we've replenished half of the buffer pool by processing
752752+ * pending sends.
753753+ */
754754+static void recv_enobufs(struct io_uring *ring, struct conn *c,
755755+ struct conn_dir *cd, int fd)
756756+{
757757+ vlog("%d: enobufs hit\n", c->tid);
758758+759759+ cd->rcv_enobufs++;
760760+761761+ /*
762762+ * If we're a sink, mark rcv as rearm. If we're not, then mark us as
763763+ * needing a rearm for receive and send. The completing send will
764764+ * kick the recv rearm.
765765+ */
766766+ if (!is_sink) {
767767+ int do_recv_arm = 1;
768768+769769+ if (!cd->pending_send)
770770+ do_recv_arm = !prep_next_send(ring, c, cd, fd);
771771+ if (do_recv_arm)
772772+ __submit_receive(ring, c, &c->cd[0], c->in_fd);
773773+ } else {
774774+ __submit_receive(ring, c, &c->cd[0], c->in_fd);
775775+ }
776776+}
777777+778778+/*
779779+ * Kill this socket - submit a shutdown and link a close to it. We don't
780780+ * care about shutdown status, so mark it as not needing to post a CQE unless
781781+ * it fails.
782782+ */
783783+static void queue_shutdown_close(struct io_uring *ring, struct conn *c, int fd)
784784+{
785785+ struct io_uring_sqe *sqe1, *sqe2;
786786+787787+ /*
788788+ * On the off chance that we run out of SQEs after the first one,
789789+ * grab two upfront. This it to prevent our link not working if
790790+ * get_sqe() ends up doing submissions to free up an SQE, as links
791791+ * are not valid across separate submissions.
792792+ */
793793+ sqe1 = get_sqe(ring);
794794+ sqe2 = get_sqe(ring);
795795+796796+ io_uring_prep_shutdown(sqe1, fd, SHUT_RDWR);
797797+ if (fixed_files)
798798+ sqe1->flags |= IOSQE_FIXED_FILE;
799799+ sqe1->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS;
800800+ encode_userdata(sqe1, c, __SHUTDOWN, 0, fd);
801801+802802+ if (fixed_files)
803803+ io_uring_prep_close_direct(sqe2, fd);
804804+ else
805805+ io_uring_prep_close(sqe2, fd);
806806+ encode_userdata(sqe2, c, __CLOSE, 0, fd);
807807+}
808808+809809+/*
810810+ * This connection is going away, queue a cancel for any pending recv, for
811811+ * example, we have pending for this ring. For completeness, we issue a cancel
812812+ * for any request we have pending for both in_fd and out_fd.
813813+ */
814814+static void queue_cancel(struct io_uring *ring, struct conn *c)
815815+{
816816+ struct io_uring_sqe *sqe;
817817+ int flags = 0;
818818+819819+ if (fixed_files)
820820+ flags |= IORING_ASYNC_CANCEL_FD_FIXED;
821821+822822+ sqe = get_sqe(ring);
823823+ io_uring_prep_cancel_fd(sqe, c->in_fd, flags);
824824+ encode_userdata(sqe, c, __CANCEL, 0, c->in_fd);
825825+ c->pending_cancels++;
826826+827827+ if (c->out_fd != -1) {
828828+ sqe = get_sqe(ring);
829829+ io_uring_prep_cancel_fd(sqe, c->out_fd, flags);
830830+ encode_userdata(sqe, c, __CANCEL, 0, c->out_fd);
831831+ c->pending_cancels++;
832832+ }
833833+834834+ io_uring_submit(ring);
835835+}
836836+837837+static int pending_shutdown(struct conn *c)
838838+{
839839+ return c->cd[0].pending_shutdown + c->cd[1].pending_shutdown;
840840+}
841841+842842+static bool should_shutdown(struct conn *c)
843843+{
844844+ int i;
845845+846846+ if (!pending_shutdown(c))
847847+ return false;
848848+ if (is_sink)
849849+ return true;
850850+ if (!bidi)
851851+ return c->cd[0].in_bytes == c->cd[1].out_bytes;
852852+853853+ for (i = 0; i < 2; i++) {
854854+ if (c->cd[0].rcv != c->cd[1].snd)
855855+ return false;
856856+ if (c->cd[1].rcv != c->cd[0].snd)
857857+ return false;
858858+ }
859859+860860+ return true;
861861+}
862862+863863+/*
864864+ * Close this connection - send a ring message to the connection with intent
865865+ * to stop. When the client gets the message, it will initiate the stop.
866866+ */
867867+static void __close_conn(struct io_uring *ring, struct conn *c)
868868+{
869869+ struct io_uring_sqe *sqe;
870870+ uint64_t user_data;
871871+872872+ printf("Client %d: queueing stop\n", c->tid);
873873+874874+ user_data = __raw_encode(c->tid, __STOP, 0, 0);
875875+ sqe = io_uring_get_sqe(ring);
876876+ io_uring_prep_msg_ring(sqe, c->ring.ring_fd, 0, user_data, 0);
877877+ encode_userdata(sqe, c, __NOP, 0, 0);
878878+ io_uring_submit(ring);
879879+}
880880+881881+static void close_cd(struct conn *c, struct conn_dir *cd)
882882+{
883883+ cd->pending_shutdown = 1;
884884+885885+ if (cd->pending_send)
886886+ return;
887887+888888+ if (!(c->flags & CONN_F_PENDING_SHUTDOWN)) {
889889+ gettimeofday(&c->end_time, NULL);
890890+ c->flags |= CONN_F_PENDING_SHUTDOWN | CONN_F_END_TIME;
891891+ }
892892+}
893893+894894+/*
895895+ * We're done with this buffer, add it back to our pool so the kernel is
896896+ * free to use it again.
897897+ */
898898+static int replenish_buffer(struct conn_buf_ring *cbr, int bid, int offset)
899899+{
900900+ void *this_buf = cbr->buf + bid * buf_size;
901901+902902+ assert(bid < nr_bufs);
903903+904904+ io_uring_buf_ring_add(cbr->br, this_buf, buf_size, bid, br_mask, offset);
905905+ return buf_size;
906906+}
907907+908908+/*
909909+ * Iterate buffers from '*bid' and with a total size of 'bytes' and add them
910910+ * back to our receive ring so they can be reused for new receives.
911911+ */
912912+static int replenish_buffers(struct conn *c, int *bid, int bytes)
913913+{
914914+ struct conn_buf_ring *cbr = &c->in_br;
915915+ int nr_packets = 0;
916916+917917+ while (bytes) {
918918+ int this_len = replenish_buffer(cbr, *bid, nr_packets);
919919+920920+ if (this_len > bytes)
921921+ this_len = bytes;
922922+ bytes -= this_len;
923923+924924+ *bid = (*bid + 1) & (nr_bufs - 1);
925925+ nr_packets++;
926926+ }
927927+928928+ io_uring_buf_ring_advance(cbr->br, nr_packets);
929929+ return nr_packets;
930930+}
931931+932932+static void free_mvec(struct msg_vec *mvec)
933933+{
934934+ free(mvec->iov);
935935+ mvec->iov = NULL;
936936+}
937937+938938+static void init_mvec(struct msg_vec *mvec)
939939+{
940940+ memset(mvec, 0, sizeof(*mvec));
941941+ mvec->iov = malloc(sizeof(struct iovec));
942942+ mvec->vec_size = 1;
943943+}
944944+945945+static void init_msgs(struct conn_dir *cd)
946946+{
947947+ memset(&cd->io_snd_msg, 0, sizeof(cd->io_snd_msg));
948948+ memset(&cd->io_rcv_msg, 0, sizeof(cd->io_rcv_msg));
949949+ init_mvec(&cd->io_snd_msg.vecs[0]);
950950+ init_mvec(&cd->io_snd_msg.vecs[1]);
951951+ init_mvec(&cd->io_rcv_msg.vecs[0]);
952952+}
953953+954954+static void free_msgs(struct conn_dir *cd)
955955+{
956956+ free_mvec(&cd->io_snd_msg.vecs[0]);
957957+ free_mvec(&cd->io_snd_msg.vecs[1]);
958958+ free_mvec(&cd->io_rcv_msg.vecs[0]);
959959+}
960960+961961+/*
962962+ * Multishot accept completion triggered. If we're acting as a sink, we're
963963+ * good to go. Just issue a receive for that case. If we're acting as a proxy,
964964+ * then start opening a socket that we can use to connect to the other end.
965965+ */
966966+static int handle_accept(struct io_uring *ring, struct io_uring_cqe *cqe)
967967+{
968968+ struct conn *c;
969969+ int i;
970970+971971+ if (nr_conns == MAX_CONNS) {
972972+ fprintf(stderr, "max clients reached %d\n", nr_conns);
973973+ return 1;
974974+ }
975975+976976+ /* main thread handles this, which is obviously serialized */
977977+ c = &conns[nr_conns];
978978+ c->tid = nr_conns++;
979979+ c->in_fd = -1;
980980+ c->out_fd = -1;
981981+982982+ for (i = 0; i < 2; i++) {
983983+ struct conn_dir *cd = &c->cd[i];
984984+985985+ cd->index = i;
986986+ cd->snd_next_bid = -1;
987987+ cd->rcv_next_bid = -1;
988988+ if (ext_stat) {
989989+ cd->rcv_bucket = calloc(nr_bufs + 1, sizeof(int));
990990+ cd->snd_bucket = calloc(nr_bufs + 1, sizeof(int));
991991+ }
992992+ init_msgs(cd);
993993+ }
994994+995995+ printf("New client: id=%d, in=%d\n", c->tid, c->in_fd);
996996+ gettimeofday(&c->start_time, NULL);
997997+998998+ pthread_barrier_init(&c->startup_barrier, NULL, 2);
999999+ pthread_create(&c->thread, NULL, thread_main, c);
10001000+10011001+ /*
10021002+ * Wait for thread to have its ring setup, then either assign the fd
10031003+ * if it's non-fixed, or pass the fixed one
10041004+ */
10051005+ pthread_barrier_wait(&c->startup_barrier);
10061006+ if (!fixed_files) {
10071007+ c->in_fd = cqe->res;
10081008+ } else {
10091009+ struct io_uring_sqe *sqe;
10101010+ uint64_t user_data;
10111011+10121012+ /*
10131013+ * Ring has just been setup, we'll use index 0 as the descriptor
10141014+ * value.
10151015+ */
10161016+ user_data = __raw_encode(c->tid, __FD_PASS, 0, 0);
10171017+ sqe = io_uring_get_sqe(ring);
10181018+ io_uring_prep_msg_ring_fd(sqe, c->ring.ring_fd, cqe->res, 0,
10191019+ user_data, 0);
10201020+ encode_userdata(sqe, c, __NOP, 0, cqe->res);
10211021+ }
10221022+10231023+ return 0;
10241024+}
10251025+10261026+/*
10271027+ * Our socket request completed, issue a connect request to the other end.
10281028+ */
10291029+static int handle_sock(struct io_uring *ring, struct io_uring_cqe *cqe)
10301030+{
10311031+ struct conn *c = cqe_to_conn(cqe);
10321032+ struct io_uring_sqe *sqe;
10331033+ int ret;
10341034+10351035+ vlog("%d: sock: res=%d\n", c->tid, cqe->res);
10361036+10371037+ c->out_fd = cqe->res;
10381038+10391039+ if (ipv6) {
10401040+ memset(&c->addr6, 0, sizeof(c->addr6));
10411041+ c->addr6.sin6_family = AF_INET6;
10421042+ c->addr6.sin6_port = htons(send_port);
10431043+ ret = inet_pton(AF_INET6, host, &c->addr6.sin6_addr);
10441044+ } else {
10451045+ memset(&c->addr, 0, sizeof(c->addr));
10461046+ c->addr.sin_family = AF_INET;
10471047+ c->addr.sin_port = htons(send_port);
10481048+ ret = inet_pton(AF_INET, host, &c->addr.sin_addr);
10491049+ }
10501050+ if (ret <= 0) {
10511051+ if (!ret)
10521052+ fprintf(stderr, "host not in right format\n");
10531053+ else
10541054+ perror("inet_pton");
10551055+ return 1;
10561056+ }
10571057+10581058+ sqe = get_sqe(ring);
10591059+ if (ipv6) {
10601060+ io_uring_prep_connect(sqe, c->out_fd,
10611061+ (struct sockaddr *) &c->addr6,
10621062+ sizeof(c->addr6));
10631063+ } else {
10641064+ io_uring_prep_connect(sqe, c->out_fd,
10651065+ (struct sockaddr *) &c->addr,
10661066+ sizeof(c->addr));
10671067+ }
10681068+ encode_userdata(sqe, c, __CONNECT, 0, c->out_fd);
10691069+ if (fixed_files)
10701070+ sqe->flags |= IOSQE_FIXED_FILE;
10711071+ return 0;
10721072+}
10731073+10741074+/*
10751075+ * Connection to the other end is done, submit a receive to start receiving
10761076+ * data. If we're a bidirectional proxy, issue a receive on both ends. If not,
10771077+ * then just a single recv will do.
10781078+ */
10791079+static int handle_connect(struct io_uring *ring, struct io_uring_cqe *cqe)
10801080+{
10811081+ struct conn *c = cqe_to_conn(cqe);
10821082+10831083+ pthread_mutex_lock(&thread_lock);
10841084+ open_conns++;
10851085+ pthread_mutex_unlock(&thread_lock);
10861086+10871087+ if (bidi)
10881088+ submit_bidi_receive(ring, c);
10891089+ else
10901090+ submit_receive(ring, c);
10911091+10921092+ return 0;
10931093+}
10941094+10951095+/*
10961096+ * Append new segment to our currently active msg_vec. This will be submitted
10971097+ * as a sendmsg (with all of it), or as separate sends, later. If we're using
10981098+ * send_ring, then we won't hit this path. Instead, outgoing buffers are
10991099+ * added directly to our outgoing send buffer ring.
11001100+ */
11011101+static void send_append_vec(struct conn_dir *cd, void *data, int len)
11021102+{
11031103+ struct msg_vec *mvec = snd_msg_vec(cd);
11041104+11051105+ if (mvec->iov_len == mvec->vec_size) {
11061106+ mvec->vec_size <<= 1;
11071107+ mvec->iov = realloc(mvec->iov, mvec->vec_size * sizeof(struct iovec));
11081108+ }
11091109+11101110+ mvec->iov[mvec->iov_len].iov_base = data;
11111111+ mvec->iov[mvec->iov_len].iov_len = len;
11121112+ mvec->iov_len++;
11131113+}
11141114+11151115+/*
11161116+ * Queue a send based on the data received in this cqe, which came from
11171117+ * a completed receive operation.
11181118+ */
11191119+static void send_append(struct conn *c, struct conn_dir *cd, void *data,
11201120+ int bid, int len)
11211121+{
11221122+ vlog("%d: send %d (%p, bid %d)\n", c->tid, len, data, bid);
11231123+11241124+ assert(bid < nr_bufs);
11251125+11261126+ /* if using provided buffers for send, add it upfront */
11271127+ if (send_ring) {
11281128+ struct conn_buf_ring *cbr = &c->out_br;
11291129+11301130+ io_uring_buf_ring_add(cbr->br, data, len, bid, br_mask, 0);
11311131+ io_uring_buf_ring_advance(cbr->br, 1);
11321132+ } else {
11331133+ send_append_vec(cd, data, len);
11341134+ }
11351135+}
11361136+11371137+/*
11381138+ * For non recvmsg && multishot, a zero receive marks the end. For recvmsg
11391139+ * with multishot, we always get the header regardless. Hence a "zero receive"
11401140+ * is the size of the header.
11411141+ */
11421142+static int recv_done_res(int res)
11431143+{
11441144+ if (!res)
11451145+ return 1;
11461146+ if (rcv_msg && recv_mshot && res == sizeof(struct io_uring_recvmsg_out))
11471147+ return 1;
11481148+ return 0;
11491149+}
11501150+11511151+/*
11521152+ * Any receive that isn't recvmsg with multishot can be handled the same way.
11531153+ * Iterate from '*bid' and 'in_bytes' in total, and append the data to the
11541154+ * outgoing queue.
11551155+ */
11561156+static int recv_bids(struct conn *c, struct conn_dir *cd, int *bid, int in_bytes)
11571157+{
11581158+ struct conn_buf_ring *cbr = &c->out_br;
11591159+ struct conn_buf_ring *in_cbr = &c->in_br;
11601160+ struct io_uring_buf *buf;
11611161+ int nr_packets = 0;
11621162+11631163+ while (in_bytes) {
11641164+ int this_bytes;
11651165+ void *data;
11661166+11671167+ buf = &in_cbr->br->bufs[*bid];
11681168+ data = (void *) (unsigned long) buf->addr;
11691169+ this_bytes = buf->len;
11701170+ if (this_bytes > in_bytes)
11711171+ this_bytes = in_bytes;
11721172+11731173+ in_bytes -= this_bytes;
11741174+11751175+ if (send_ring)
11761176+ io_uring_buf_ring_add(cbr->br, data, this_bytes, *bid,
11771177+ br_mask, nr_packets);
11781178+ else
11791179+ send_append(c, cd, data, *bid, this_bytes);
11801180+11811181+ *bid = (*bid + 1) & (nr_bufs - 1);
11821182+ nr_packets++;
11831183+ }
11841184+11851185+ if (send_ring)
11861186+ io_uring_buf_ring_advance(cbr->br, nr_packets);
11871187+11881188+ return nr_packets;
11891189+}
11901190+11911191+/*
11921192+ * Special handling of recvmsg with multishot
11931193+ */
11941194+static int recv_mshot_msg(struct conn *c, struct conn_dir *cd, int *bid,
11951195+ int in_bytes)
11961196+{
11971197+ struct conn_buf_ring *cbr = &c->out_br;
11981198+ struct conn_buf_ring *in_cbr = &c->in_br;
11991199+ struct io_uring_buf *buf;
12001200+ int nr_packets = 0;
12011201+12021202+ while (in_bytes) {
12031203+ struct io_uring_recvmsg_out *pdu;
12041204+ int this_bytes;
12051205+ void *data;
12061206+12071207+ buf = &in_cbr->br->bufs[*bid];
12081208+12091209+ /*
12101210+ * multishot recvmsg puts a header in front of the data - we
12111211+ * have to take that into account for the send setup, and
12121212+ * adjust the actual data read to not take this metadata into
12131213+ * account. For this use case, namelen and controllen will not
12141214+ * be set. If they were, they would need to be factored in too.
12151215+ */
12161216+ buf->len -= sizeof(struct io_uring_recvmsg_out);
12171217+ in_bytes -= sizeof(struct io_uring_recvmsg_out);
12181218+12191219+ pdu = (void *) (unsigned long) buf->addr;
12201220+ vlog("pdu namelen %d, controllen %d, payload %d flags %x\n",
12211221+ pdu->namelen, pdu->controllen, pdu->payloadlen,
12221222+ pdu->flags);
12231223+ data = (void *) (pdu + 1);
12241224+12251225+ this_bytes = pdu->payloadlen;
12261226+ if (this_bytes > in_bytes)
12271227+ this_bytes = in_bytes;
12281228+12291229+ in_bytes -= this_bytes;
12301230+12311231+ if (send_ring)
12321232+ io_uring_buf_ring_add(cbr->br, data, this_bytes, *bid,
12331233+ br_mask, nr_packets);
12341234+ else
12351235+ send_append(c, cd, data, *bid, this_bytes);
12361236+12371237+ *bid = (*bid + 1) & (nr_bufs - 1);
12381238+ nr_packets++;
12391239+ }
12401240+12411241+ if (send_ring)
12421242+ io_uring_buf_ring_advance(cbr->br, nr_packets);
12431243+12441244+ return nr_packets;
12451245+}
12461246+12471247+static int __handle_recv(struct io_uring *ring, struct conn *c,
12481248+ struct conn_dir *cd, struct io_uring_cqe *cqe)
12491249+{
12501250+ struct conn_dir *ocd = &c->cd[!cd->index];
12511251+ int bid, nr_packets;
12521252+12531253+ /*
12541254+ * Not having a buffer attached should only happen if we get a zero
12551255+ * sized receive, because the other end closed the connection. It
12561256+ * cannot happen otherwise, as all our receives are using provided
12571257+ * buffers and hence it's not possible to return a CQE with a non-zero
12581258+ * result and not have a buffer attached.
12591259+ */
12601260+ if (!(cqe->flags & IORING_CQE_F_BUFFER)) {
12611261+ cd->pending_recv = 0;
12621262+12631263+ if (!recv_done_res(cqe->res)) {
12641264+ fprintf(stderr, "no buffer assigned, res=%d\n", cqe->res);
12651265+ return 1;
12661266+ }
12671267+start_close:
12681268+ prep_next_send(ring, c, ocd, other_dir_fd(c, cqe_to_fd(cqe)));
12691269+ close_cd(c, cd);
12701270+ return 0;
12711271+ }
12721272+12731273+ if (cqe->res && cqe->res < buf_size)
12741274+ cd->rcv_shrt++;
12751275+12761276+ bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
12771277+12781278+ /*
12791279+ * BIDI will use the same buffer pool and do receive on both CDs,
12801280+ * so can't reliably check. TODO.
12811281+ */
12821282+ if (!bidi && cd->rcv_next_bid != -1 && bid != cd->rcv_next_bid) {
12831283+ fprintf(stderr, "recv bid %d, wanted %d\n", bid, cd->rcv_next_bid);
12841284+ goto start_close;
12851285+ }
12861286+12871287+ vlog("%d: recv: bid=%d, res=%d, cflags=%x\n", c->tid, bid, cqe->res, cqe->flags);
12881288+ /*
12891289+ * If we're a sink, we're done here. Just replenish the buffer back
12901290+ * to the pool. For proxy mode, we will send the data to the other
12911291+ * end and the buffer will be replenished once the send is done with
12921292+ * it.
12931293+ */
12941294+ if (is_sink)
12951295+ nr_packets = replenish_buffers(c, &bid, cqe->res);
12961296+ else if (rcv_msg && recv_mshot)
12971297+ nr_packets = recv_mshot_msg(c, ocd, &bid, cqe->res);
12981298+ else
12991299+ nr_packets = recv_bids(c, ocd, &bid, cqe->res);
13001300+13011301+ if (cd->rcv_bucket)
13021302+ cd->rcv_bucket[nr_packets]++;
13031303+13041304+ if (!is_sink) {
13051305+ ocd->out_buffers += nr_packets;
13061306+ assert(ocd->out_buffers <= nr_bufs);
13071307+ }
13081308+13091309+ cd->rcv++;
13101310+ cd->rcv_next_bid = bid;
13111311+13121312+ /*
13131313+ * If IORING_CQE_F_MORE isn't set, then this is either a normal recv
13141314+ * that needs rearming, or it's a multishot that won't post any further
13151315+ * completions. Setup a new one for these cases.
13161316+ */
13171317+ if (!(cqe->flags & IORING_CQE_F_MORE)) {
13181318+ cd->pending_recv = 0;
13191319+ if (recv_done_res(cqe->res))
13201320+ goto start_close;
13211321+ if (is_sink)
13221322+ __submit_receive(ring, c, &c->cd[0], c->in_fd);
13231323+ }
13241324+13251325+ /*
13261326+ * Submit a send if we won't get anymore notifications from this
13271327+ * recv, or if we have nr_bufs / 2 queued up. If BIDI mode, send
13281328+ * every buffer. We assume this is interactive mode, and hence don't
13291329+ * delay anything.
13301330+ */
13311331+ if (((!ocd->pending_send && (bidi || (ocd->out_buffers >= nr_bufs / 2))) ||
13321332+ !(cqe->flags & IORING_CQE_F_MORE)) && !is_sink)
13331333+ prep_next_send(ring, c, ocd, other_dir_fd(c, cqe_to_fd(cqe)));
13341334+13351335+ if (!recv_done_res(cqe->res))
13361336+ cd->in_bytes += cqe->res;
13371337+ return 0;
13381338+}
/*
 * Receive completion entry point: look up the connection and direction
 * that 'cqe' belongs to, and let __handle_recv() do the work.
 */
static int handle_recv(struct io_uring *ring, struct io_uring_cqe *cqe)
{
	struct conn *conn = cqe_to_conn(cqe);

	return __handle_recv(ring, conn, cqe_to_conn_dir(conn, cqe), cqe);
}
13471347+13481348+static int recv_error(struct error_handler *err, struct io_uring *ring,
13491349+ struct io_uring_cqe *cqe)
13501350+{
13511351+ struct conn *c = cqe_to_conn(cqe);
13521352+ struct conn_dir *cd = cqe_to_conn_dir(c, cqe);
13531353+13541354+ cd->pending_recv = 0;
13551355+13561356+ if (cqe->res != -ENOBUFS)
13571357+ return default_error(err, ring, cqe);
13581358+13591359+ recv_enobufs(ring, c, cd, other_dir_fd(c, cqe_to_fd(cqe)));
13601360+ return 0;
13611361+}
13621362+13631363+static void submit_send(struct io_uring *ring, struct conn *c,
13641364+ struct conn_dir *cd, int fd, void *data, int len,
13651365+ int bid, int flags)
13661366+{
13671367+ struct io_uring_sqe *sqe;
13681368+ int bgid = c->out_br.bgid;
13691369+13701370+ if (cd->pending_send)
13711371+ return;
13721372+ cd->pending_send = 1;
13731373+13741374+ flags |= MSG_WAITALL | MSG_NOSIGNAL;
13751375+13761376+ sqe = get_sqe(ring);
13771377+ if (snd_msg) {
13781378+ struct io_msg *imsg = &cd->io_snd_msg;
13791379+13801380+ if (snd_zc) {
13811381+ io_uring_prep_sendmsg_zc(sqe, fd, &imsg->msg, flags);
13821382+ cd->snd_notif++;
13831383+ } else {
13841384+ io_uring_prep_sendmsg(sqe, fd, &imsg->msg, flags);
13851385+ }
13861386+ } else if (send_ring) {
13871387+ io_uring_prep_send(sqe, fd, NULL, 0, flags);
13881388+ } else if (!snd_zc) {
13891389+ io_uring_prep_send(sqe, fd, data, len, flags);
13901390+ } else {
13911391+ io_uring_prep_send_zc(sqe, fd, data, len, flags, 0);
13921392+ sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
13931393+ sqe->buf_index = bid;
13941394+ cd->snd_notif++;
13951395+ }
13961396+ encode_userdata(sqe, c, __SEND, bid, fd);
13971397+ if (fixed_files)
13981398+ sqe->flags |= IOSQE_FIXED_FILE;
13991399+ if (send_ring) {
14001400+ sqe->flags |= IOSQE_BUFFER_SELECT;
14011401+ sqe->buf_group = bgid;
14021402+ }
14031403+ if (snd_bundle) {
14041404+ sqe->ioprio |= IORING_RECVSEND_BUNDLE;
14051405+ cd->snd_mshot++;
14061406+ } else if (send_ring)
14071407+ cd->snd_mshot++;
14081408+}
14091409+14101410+/*
14111411+ * Prepare the next send request, if we need to. If one is already pending,
14121412+ * or if we're a sink and we don't need to do sends, then there's nothing
14131413+ * to do.
14141414+ *
14151415+ * Return 1 if another send completion is expected, 0 if not.
14161416+ */
14171417+static int prep_next_send(struct io_uring *ring, struct conn *c,
14181418+ struct conn_dir *cd, int fd)
14191419+{
14201420+ int bid;
14211421+14221422+ if (cd->pending_send || is_sink)
14231423+ return 0;
14241424+ if (!cd->out_buffers)
14251425+ return 0;
14261426+14271427+ bid = cd->snd_next_bid;
14281428+ if (bid == -1)
14291429+ bid = 0;
14301430+14311431+ if (send_ring) {
14321432+ /*
14331433+ * send_ring mode is easy, there's nothing to do but submit
14341434+ * our next send request. That will empty the entire outgoing
14351435+ * queue.
14361436+ */
14371437+ submit_send(ring, c, cd, fd, NULL, 0, bid, 0);
14381438+ return 1;
14391439+ } else if (snd_msg) {
14401440+ /*
14411441+ * For sendmsg mode, submit our currently prepared iovec, if
14421442+ * we have one, and swap our iovecs so that any further
14431443+ * receives will start preparing that one.
14441444+ */
14451445+ struct io_msg *imsg = &cd->io_snd_msg;
14461446+14471447+ if (!msg_vec(imsg)->iov_len)
14481448+ return 0;
14491449+ imsg->msg.msg_iov = msg_vec(imsg)->iov;
14501450+ imsg->msg.msg_iovlen = msg_vec(imsg)->iov_len;
14511451+ msg_vec(imsg)->iov_len = 0;
14521452+ imsg->vec_index = !imsg->vec_index;
14531453+ submit_send(ring, c, cd, fd, NULL, 0, bid, 0);
14541454+ return 1;
14551455+ } else {
14561456+ /*
14571457+ * send without send_ring - submit the next available vec,
14581458+ * if any. If this vec is the last one in the current series,
14591459+ * then swap to the next vec. We flag each send with MSG_MORE,
14601460+ * unless this is the last part of the current vec.
14611461+ */
14621462+ struct io_msg *imsg = &cd->io_snd_msg;
14631463+ struct msg_vec *mvec = msg_vec(imsg);
14641464+ int flags = !snd_zc ? MSG_MORE : 0;
14651465+ struct iovec *iov;
14661466+14671467+ if (mvec->iov_len == mvec->cur_iov)
14681468+ return 0;
14691469+ imsg->msg.msg_iov = msg_vec(imsg)->iov;
14701470+ iov = &mvec->iov[mvec->cur_iov];
14711471+ mvec->cur_iov++;
14721472+ if (mvec->cur_iov == mvec->iov_len) {
14731473+ mvec->iov_len = 0;
14741474+ mvec->cur_iov = 0;
14751475+ imsg->vec_index = !imsg->vec_index;
14761476+ flags = 0;
14771477+ }
14781478+ submit_send(ring, c, cd, fd, iov->iov_base, iov->iov_len, bid, flags);
14791479+ return 1;
14801480+ }
14811481+}
14821482+14831483+/*
14841484+ * Handling a send with an outgoing send ring. Get the buffers from the
14851485+ * receive side, and add them to the ingoing buffer ring again.
14861486+ */
14871487+static int handle_send_ring(struct conn *c, struct conn_dir *cd,
14881488+ int bid, int bytes)
14891489+{
14901490+ struct conn_buf_ring *in_cbr = &c->in_br;
14911491+ struct conn_buf_ring *out_cbr = &c->out_br;
14921492+ int i = 0;
14931493+14941494+ while (bytes) {
14951495+ struct io_uring_buf *buf = &out_cbr->br->bufs[bid];
14961496+ int this_bytes;
14971497+ void *this_buf;
14981498+14991499+ this_bytes = buf->len;
15001500+ if (this_bytes > bytes)
15011501+ this_bytes = bytes;
15021502+15031503+ cd->out_bytes += this_bytes;
15041504+15051505+ vlog("%d: send: bid=%d, len=%d\n", c->tid, bid, this_bytes);
15061506+15071507+ this_buf = in_cbr->buf + bid * buf_size;
15081508+ io_uring_buf_ring_add(in_cbr->br, this_buf, buf_size, bid, br_mask, i);
15091509+ /*
15101510+ * Find the provided buffer that the receive consumed, and
15111511+ * which we then used for the send, and add it back to the
15121512+ * pool so it can get picked by another receive. Once the send
15131513+ * is done, we're done with it.
15141514+ */
15151515+ bid = (bid + 1) & (nr_bufs - 1);
15161516+ bytes -= this_bytes;
15171517+ i++;
15181518+ }
15191519+ cd->snd_next_bid = bid;
15201520+ io_uring_buf_ring_advance(in_cbr->br, i);
15211521+15221522+ if (pending_shutdown(c))
15231523+ close_cd(c, cd);
15241524+15251525+ return i;
15261526+}
15271527+15281528+/*
15291529+ * sendmsg, or send without a ring. Just add buffers back to the ingoing
15301530+ * ring for receives.
15311531+ */
15321532+static int handle_send_buf(struct conn *c, struct conn_dir *cd, int bid,
15331533+ int bytes)
15341534+{
15351535+ struct conn_buf_ring *in_cbr = &c->in_br;
15361536+ int i = 0;
15371537+15381538+ while (bytes) {
15391539+ struct io_uring_buf *buf = &in_cbr->br->bufs[bid];
15401540+ int this_bytes;
15411541+15421542+ this_bytes = bytes;
15431543+ if (this_bytes > buf->len)
15441544+ this_bytes = buf->len;
15451545+15461546+ vlog("%d: send: bid=%d, len=%d\n", c->tid, bid, this_bytes);
15471547+15481548+ cd->out_bytes += this_bytes;
15491549+ /* each recvmsg mshot package has this overhead */
15501550+ if (rcv_msg && recv_mshot)
15511551+ cd->out_bytes += sizeof(struct io_uring_recvmsg_out);
15521552+ replenish_buffer(in_cbr, bid, i);
15531553+ bid = (bid + 1) & (nr_bufs - 1);
15541554+ bytes -= this_bytes;
15551555+ i++;
15561556+ }
15571557+ io_uring_buf_ring_advance(in_cbr->br, i);
15581558+ cd->snd_next_bid = bid;
15591559+ return i;
15601560+}
15611561+15621562+static int __handle_send(struct io_uring *ring, struct conn *c,
15631563+ struct conn_dir *cd, struct io_uring_cqe *cqe)
15641564+{
15651565+ struct conn_dir *ocd;
15661566+ int bid, nr_packets;
15671567+15681568+ if (send_ring) {
15691569+ if (!(cqe->flags & IORING_CQE_F_BUFFER)) {
15701570+ fprintf(stderr, "no buffer in send?! %d\n", cqe->res);
15711571+ return 1;
15721572+ }
15731573+ bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
15741574+ } else {
15751575+ bid = cqe_to_bid(cqe);
15761576+ }
15771577+15781578+ /*
15791579+ * CQE notifications only happen with send/sendmsg zerocopy. They
15801580+ * tell us that the data has been acked, and that hence the buffer
15811581+ * is now free to reuse. Waiting on an ACK for each packet will slow
15821582+ * us down tremendously, so do all of our sends and then wait for
15831583+ * the ACKs to come in. They tend to come in bundles anyway. Once
15841584+ * all acks are done (cd->snd_notif == 0), then fire off the next
15851585+ * receive.
15861586+ */
15871587+ if (cqe->flags & IORING_CQE_F_NOTIF) {
15881588+ cd->snd_notif--;
15891589+ } else {
15901590+ if (cqe->res && cqe->res < buf_size)
15911591+ cd->snd_shrt++;
15921592+15931593+ /*
15941594+ * BIDI will use the same buffer pool and do sends on both CDs,
15951595+ * so can't reliably check. TODO.
15961596+ */
15971597+ if (!bidi && send_ring && cd->snd_next_bid != -1 &&
15981598+ bid != cd->snd_next_bid) {
15991599+ fprintf(stderr, "send bid %d, wanted %d at %lu\n", bid,
16001600+ cd->snd_next_bid, cd->out_bytes);
16011601+ goto out_close;
16021602+ }
16031603+16041604+ assert(bid <= nr_bufs);
16051605+16061606+ vlog("send: got %d, %lu\n", cqe->res, cd->out_bytes);
16071607+16081608+ if (send_ring)
16091609+ nr_packets = handle_send_ring(c, cd, bid, cqe->res);
16101610+ else
16111611+ nr_packets = handle_send_buf(c, cd, bid, cqe->res);
16121612+16131613+ if (cd->snd_bucket)
16141614+ cd->snd_bucket[nr_packets]++;
16151615+16161616+ cd->out_buffers -= nr_packets;
16171617+ assert(cd->out_buffers >= 0);
16181618+16191619+ cd->snd++;
16201620+ }
16211621+16221622+ if (!(cqe->flags & IORING_CQE_F_MORE)) {
16231623+ int do_recv_arm;
16241624+16251625+ cd->pending_send = 0;
16261626+16271627+ /*
16281628+ * send done - see if the current vec has data to submit, and
16291629+ * do so if it does. if it doesn't have data yet, nothing to
16301630+ * do.
16311631+ */
16321632+ do_recv_arm = !prep_next_send(ring, c, cd, cqe_to_fd(cqe));
16331633+16341634+ ocd = &c->cd[!cd->index];
16351635+ if (!cd->snd_notif && do_recv_arm && !ocd->pending_recv) {
16361636+ int fd = other_dir_fd(c, cqe_to_fd(cqe));
16371637+16381638+ __submit_receive(ring, c, ocd, fd);
16391639+ }
16401640+out_close:
16411641+ if (pending_shutdown(c))
16421642+ close_cd(c, cd);
16431643+ }
16441644+16451645+ vlog("%d: pending sends %d\n", c->tid, cd->pending_send);
16461646+ return 0;
16471647+}

/*
 * CQE entry point for send completions. Looks up the connection and the
 * connection direction the CQE belongs to, then forwards everything to
 * __handle_send() and returns its result.
 */
static int handle_send(struct io_uring *ring, struct io_uring_cqe *cqe)
{
	struct conn *conn = cqe_to_conn(cqe);

	return __handle_send(ring, conn, cqe_to_conn_dir(conn, cqe), cqe);
}
16561656+16571657+static int send_error(struct error_handler *err, struct io_uring *ring,
16581658+ struct io_uring_cqe *cqe)
16591659+{
16601660+ struct conn *c = cqe_to_conn(cqe);
16611661+ struct conn_dir *cd = cqe_to_conn_dir(c, cqe);
16621662+16631663+ cd->pending_send = 0;
16641664+16651665+ /* res can have high bit set */
16661666+ if (cqe->flags & IORING_CQE_F_NOTIF)
16671667+ return handle_send(ring, cqe);
16681668+ if (cqe->res != -ENOBUFS)
16691669+ return default_error(err, ring, cqe);
16701670+16711671+ cd->snd_enobufs++;
16721672+ return 0;
16731673+}
16741674+16751675+/*
16761676+ * We don't expect to get here, as we marked it with skipping posting a
16771677+ * CQE if it was successful. If it does trigger, than means it fails and
16781678+ * that our close has not been done. Log the shutdown error and issue a new
16791679+ * separate close.
16801680+ */
16811681+static int handle_shutdown(struct io_uring *ring, struct io_uring_cqe *cqe)
16821682+{
16831683+ struct conn *c = cqe_to_conn(cqe);
16841684+ struct io_uring_sqe *sqe;
16851685+ int fd = cqe_to_fd(cqe);
16861686+16871687+ fprintf(stderr, "Got shutdown notication on fd %d\n", fd);
16881688+16891689+ if (!cqe->res)
16901690+ fprintf(stderr, "Unexpected success shutdown CQE\n");
16911691+ else if (cqe->res < 0)
16921692+ fprintf(stderr, "Shutdown got %s\n", strerror(-cqe->res));
16931693+16941694+ sqe = get_sqe(ring);
16951695+ if (fixed_files)
16961696+ io_uring_prep_close_direct(sqe, fd);
16971697+ else
16981698+ io_uring_prep_close(sqe, fd);
16991699+ encode_userdata(sqe, c, __CLOSE, 0, fd);
17001700+ return 0;
17011701+}
17021702+17031703+/*
17041704+ * Final stage of a connection, the shutdown and close has finished. Mark
17051705+ * it as disconnected and let the main loop reap it.
17061706+ */
17071707+static int handle_close(struct io_uring *ring, struct io_uring_cqe *cqe)
17081708+{
17091709+ struct conn *c = cqe_to_conn(cqe);
17101710+ int fd = cqe_to_fd(cqe);
17111711+17121712+ printf("Closed client: id=%d, in_fd=%d, out_fd=%d\n", c->tid, c->in_fd, c->out_fd);
17131713+ if (fd == c->in_fd)
17141714+ c->in_fd = -1;
17151715+ else if (fd == c->out_fd)
17161716+ c->out_fd = -1;
17171717+17181718+ if (c->in_fd == -1 && c->out_fd == -1) {
17191719+ c->flags |= CONN_F_DISCONNECTED;
17201720+17211721+ pthread_mutex_lock(&thread_lock);
17221722+ __show_stats(c);
17231723+ open_conns--;
17241724+ pthread_mutex_unlock(&thread_lock);
17251725+ free_buffer_rings(ring, c);
17261726+ free_msgs(&c->cd[0]);
17271727+ free_msgs(&c->cd[1]);
17281728+ free(c->cd[0].rcv_bucket);
17291729+ free(c->cd[0].snd_bucket);
17301730+ }
17311731+17321732+ return 0;
17331733+}
17341734+17351735+static int handle_cancel(struct io_uring *ring, struct io_uring_cqe *cqe)
17361736+{
17371737+ struct conn *c = cqe_to_conn(cqe);
17381738+ int fd = cqe_to_fd(cqe);
17391739+17401740+ c->pending_cancels--;
17411741+17421742+ vlog("%d: got cancel fd %d, refs %d\n", c->tid, fd, c->pending_cancels);
17431743+17441744+ if (!c->pending_cancels) {
17451745+ queue_shutdown_close(ring, c, c->in_fd);
17461746+ if (c->out_fd != -1)
17471747+ queue_shutdown_close(ring, c, c->out_fd);
17481748+ io_uring_submit(ring);
17491749+ }
17501750+17511751+ return 0;
17521752+}
17531753+17541754+static void open_socket(struct conn *c)
17551755+{
17561756+ if (is_sink) {
17571757+ pthread_mutex_lock(&thread_lock);
17581758+ open_conns++;
17591759+ pthread_mutex_unlock(&thread_lock);
17601760+17611761+ submit_receive(&c->ring, c);
17621762+ } else {
17631763+ struct io_uring_sqe *sqe;
17641764+ int domain;
17651765+17661766+ if (ipv6)
17671767+ domain = AF_INET6;
17681768+ else
17691769+ domain = AF_INET;
17701770+17711771+ /*
17721772+ * If fixed_files is set, proxy will use fixed files for any new
17731773+ * file descriptors it instantiates. Fixd files, or fixed
17741774+ * descriptors, are io_uring private file descriptors. They
17751775+ * cannot be accessed outside of io_uring. io_uring holds a
17761776+ * fixed reference to them, which means that we do not need to
17771777+ * grab per-request references to them. Particularly for
17781778+ * threaded applications, grabbing and dropping file references
17791779+ * for each operation can be costly as the file table is shared.
17801780+ * This generally shows up as fget/fput related overhead in any
17811781+ * workload profiles.
17821782+ *
17831783+ * Fixed descriptors are passed in via the 'fd' field just like
17841784+ * regular descriptors, and then marked as such by setting the
17851785+ * IOSQE_FIXED_FILE flag in the sqe->flags field. Some helpers
17861786+ * do that automatically, like the below, others will need it
17871787+ * set manually if they don't have a *direct*() helper.
17881788+ *
17891789+ * For operations that instantiate them, like the opening of a
17901790+ * direct socket, the application may either ask the kernel to
17911791+ * find a free one (as is done below), or the application may
17921792+ * manage the space itself and pass in an index for a currently
17931793+ * free slot in the table. If the kernel is asked to allocate a
17941794+ * free direct descriptor, note that io_uring does not abide by
17951795+ * the POSIX mandated "lowest free must be returned". It may
17961796+ * return any free descriptor of its choosing.
17971797+ */
17981798+ sqe = get_sqe(&c->ring);
17991799+ if (fixed_files)
18001800+ io_uring_prep_socket_direct_alloc(sqe, domain, SOCK_STREAM, 0, 0);
18011801+ else
18021802+ io_uring_prep_socket(sqe, domain, SOCK_STREAM, 0, 0);
18031803+ encode_userdata(sqe, c, __SOCK, 0, 0);
18041804+ }
18051805+}
18061806+18071807+/*
18081808+ * Start of connection, we got our in descriptor.
18091809+ */
18101810+static int handle_fd_pass(struct io_uring_cqe *cqe)
18111811+{
18121812+ struct conn *c = cqe_to_conn(cqe);
18131813+ int fd = cqe_to_fd(cqe);
18141814+18151815+ vlog("%d: got fd pass %d\n", c->tid, fd);
18161816+ c->in_fd = fd;
18171817+ open_socket(c);
18181818+ return 0;
18191819+}
18201820+18211821+static int handle_stop(struct io_uring_cqe *cqe)
18221822+{
18231823+ struct conn *c = cqe_to_conn(cqe);
18241824+18251825+ printf("Client %d: queueing shutdown\n", c->tid);
18261826+ queue_cancel(&c->ring, c);
18271827+ return 0;
18281828+}
18291829+18301830+/*
18311831+ * Called for each CQE that we receive. Decode the request type that it
18321832+ * came from, and call the appropriate handler.
18331833+ */
18341834+static int handle_cqe(struct io_uring *ring, struct io_uring_cqe *cqe)
18351835+{
18361836+ int ret;
18371837+18381838+ /*
18391839+ * Unlikely, but there's an error in this CQE. If an error handler
18401840+ * is defined, call it, and that will deal with it. If no error
18411841+ * handler is defined, the opcode handler either doesn't care or will
18421842+ * handle it on its own.
18431843+ */
18441844+ if (cqe->res < 0) {
18451845+ struct error_handler *err = &error_handlers[cqe_to_op(cqe)];
18461846+18471847+ if (err->error_fn)
18481848+ return err->error_fn(err, ring, cqe);
18491849+ }
18501850+18511851+ switch (cqe_to_op(cqe)) {
18521852+ case __ACCEPT:
18531853+ ret = handle_accept(ring, cqe);
18541854+ break;
18551855+ case __SOCK:
18561856+ ret = handle_sock(ring, cqe);
18571857+ break;
18581858+ case __CONNECT:
18591859+ ret = handle_connect(ring, cqe);
18601860+ break;
18611861+ case __RECV:
18621862+ case __RECVMSG:
18631863+ ret = handle_recv(ring, cqe);
18641864+ break;
18651865+ case __SEND:
18661866+ case __SENDMSG:
18671867+ ret = handle_send(ring, cqe);
18681868+ break;
18691869+ case __CANCEL:
18701870+ ret = handle_cancel(ring, cqe);
18711871+ break;
18721872+ case __SHUTDOWN:
18731873+ ret = handle_shutdown(ring, cqe);
18741874+ break;
18751875+ case __CLOSE:
18761876+ ret = handle_close(ring, cqe);
18771877+ break;
18781878+ case __FD_PASS:
18791879+ ret = handle_fd_pass(cqe);
18801880+ break;
18811881+ case __STOP:
18821882+ ret = handle_stop(cqe);
18831883+ break;
18841884+ case __NOP:
18851885+ ret = 0;
18861886+ break;
18871887+ default:
18881888+ fprintf(stderr, "bad user data %lx\n", (long) cqe->user_data);
18891889+ return 1;
18901890+ }
18911891+18921892+ return ret;
18931893+}
18941894+18951895+static void house_keeping(struct io_uring *ring)
18961896+{
18971897+ static unsigned long last_bytes;
18981898+ unsigned long bytes, elapsed;
18991899+ struct conn *c;
19001900+ int i, j;
19011901+19021902+ vlog("House keeping entered\n");
19031903+19041904+ bytes = 0;
19051905+ for (i = 0; i < nr_conns; i++) {
19061906+ c = &conns[i];
19071907+19081908+ for (j = 0; j < 2; j++) {
19091909+ struct conn_dir *cd = &c->cd[j];
19101910+19111911+ bytes += cd->in_bytes + cd->out_bytes;
19121912+ }
19131913+ if (c->flags & CONN_F_DISCONNECTED) {
19141914+ vlog("%d: disconnected\n", i);
19151915+19161916+ if (!(c->flags & CONN_F_REAPED)) {
19171917+ void *ret;
19181918+19191919+ pthread_join(c->thread, &ret);
19201920+ c->flags |= CONN_F_REAPED;
19211921+ }
19221922+ continue;
19231923+ }
19241924+ if (c->flags & CONN_F_DISCONNECTING)
19251925+ continue;
19261926+19271927+ if (should_shutdown(c)) {
19281928+ __close_conn(ring, c);
19291929+ c->flags |= CONN_F_DISCONNECTING;
19301930+ }
19311931+ }
19321932+19331933+ elapsed = mtime_since_now(&last_housekeeping);
19341934+ if (bytes && elapsed >= 900) {
19351935+ unsigned long bw;
19361936+19371937+ bw = (8 * (bytes - last_bytes) / 1000UL) / elapsed;
19381938+ if (bw) {
19391939+ if (open_conns)
19401940+ printf("Bandwidth (threads=%d): %'luMbit\n", open_conns, bw);
19411941+ gettimeofday(&last_housekeeping, NULL);
19421942+ last_bytes = bytes;
19431943+ }
19441944+ }
19451945+}
19461946+19471947+/*
19481948+ * Event loop shared between the parent, and the connections. Could be
19491949+ * split in two, as they don't handle the same types of events. For the per
19501950+ * connection loop, 'c' is valid. For the main loop, it's NULL.
19511951+ */
19521952+static int __event_loop(struct io_uring *ring, struct conn *c)
19531953+{
19541954+ struct __kernel_timespec active_ts, idle_ts;
19551955+ int flags;
19561956+19571957+ idle_ts.tv_sec = 0;
19581958+ idle_ts.tv_nsec = 100000000LL;
19591959+ active_ts = idle_ts;
19601960+ if (wait_usec > 1000000) {
19611961+ active_ts.tv_sec = wait_usec / 1000000;
19621962+ wait_usec -= active_ts.tv_sec * 1000000;
19631963+ }
19641964+ active_ts.tv_nsec = wait_usec * 1000;
19651965+19661966+ gettimeofday(&last_housekeeping, NULL);
19671967+19681968+ flags = 0;
19691969+ while (1) {
19701970+ struct __kernel_timespec *ts = &idle_ts;
19711971+ struct io_uring_cqe *cqe;
19721972+ unsigned int head;
19731973+ int ret, i, to_wait;
19741974+19751975+ /*
19761976+ * If wait_batch is set higher than 1, then we'll wait on
19771977+ * that amount of CQEs to be posted each loop. If used with
19781978+ * DEFER_TASKRUN, this can provide a substantial reduction
19791979+ * in context switch rate as the task isn't woken until the
19801980+ * requested number of events can be returned.
19811981+ *
19821982+ * Can be used with -t to set a wait_usec timeout as well.
19831983+ * For example, if an application can deal with 250 usec
19841984+ * of wait latencies, it can set -w8 -t250 which will cause
19851985+ * io_uring to return when either 8 events have been received,
19861986+ * or if 250 usec of waiting has passed.
19871987+ *
19881988+ * If we don't have any open connections, wait on just 1
19891989+ * always.
19901990+ */
19911991+ to_wait = 1;
19921992+ if (open_conns && !flags) {
19931993+ ts = &active_ts;
19941994+ to_wait = wait_batch;
19951995+ }
19961996+19971997+ vlog("Submit and wait for %d\n", to_wait);
19981998+ ret = io_uring_submit_and_wait_timeout(ring, &cqe, to_wait, ts, NULL);
19991999+20002000+ if (*ring->cq.koverflow)
20012001+ printf("overflow %u\n", *ring->cq.koverflow);
20022002+ if (*ring->sq.kflags & IORING_SQ_CQ_OVERFLOW)
20032003+ printf("saw overflow\n");
20042004+20052005+ vlog("Submit and wait: %d\n", ret);
20062006+20072007+ i = flags = 0;
20082008+ io_uring_for_each_cqe(ring, head, cqe) {
20092009+ if (handle_cqe(ring, cqe))
20102010+ return 1;
20112011+ flags |= cqe_to_conn(cqe)->flags;
20122012+ ++i;
20132013+ }
20142014+20152015+ vlog("Handled %d events\n", i);
20162016+20172017+ /*
20182018+ * Advance the CQ ring for seen events when we've processed
20192019+ * all of them in this loop. This can also be done with
20202020+ * io_uring_cqe_seen() in each handler above, which just marks
20212021+ * that single CQE as seen. However, it's more efficient to
20222022+ * mark a batch as seen when we're done with that batch.
20232023+ */
20242024+ if (i) {
20252025+ io_uring_cq_advance(ring, i);
20262026+ events += i;
20272027+ }
20282028+20292029+ event_loops++;
20302030+ if (c) {
20312031+ if (c->flags & CONN_F_DISCONNECTED)
20322032+ break;
20332033+ } else {
20342034+ house_keeping(ring);
20352035+ }
20362036+ }
20372037+20382038+ return 0;
20392039+}
20402040+20412041+/*
20422042+ * Main event loop, Submit our multishot accept request, and then just loop
20432043+ * around handling incoming connections.
20442044+ */
20452045+static int parent_loop(struct io_uring *ring, int fd)
20462046+{
20472047+ struct io_uring_sqe *sqe;
20482048+20492049+ /*
20502050+ * proxy provides a way to use either multishot receive or not, but
20512051+ * for accept, we always use multishot. A multishot accept request
20522052+ * needs only be armed once, and then it'll trigger a completion and
20532053+ * post a CQE whenever a new connection is accepted. No need to do
20542054+ * anything else, unless the multishot accept terminates. This happens
20552055+ * if it encounters an error. Applications should check for
20562056+ * IORING_CQE_F_MORE in cqe->flags - this tells you if more completions
20572057+ * are expected from this request or not. Non-multishot never have
20582058+ * this set, where multishot will always have this set unless an error
20592059+ * occurs.
20602060+ */
20612061+ sqe = get_sqe(ring);
20622062+ if (fixed_files)
20632063+ io_uring_prep_multishot_accept_direct(sqe, fd, NULL, NULL, 0);
20642064+ else
20652065+ io_uring_prep_multishot_accept(sqe, fd, NULL, NULL, 0);
20662066+ __encode_userdata(sqe, 0, __ACCEPT, 0, fd);
20672067+20682068+ return __event_loop(ring, NULL);
20692069+}
20702070+20712071+static int init_ring(struct io_uring *ring, int nr_files)
20722072+{
20732073+ struct io_uring_params params;
20742074+ int ret;
20752075+20762076+ /*
20772077+ * By default, set us up with a big CQ ring. Not strictly needed
20782078+ * here, but it's very important to never overflow the CQ ring.
20792079+ * Events will not be dropped if this happens, but it does slow
20802080+ * the application down in dealing with overflown events.
20812081+ *
20822082+ * Set SINGLE_ISSUER, which tells the kernel that only one thread
20832083+ * is doing IO submissions. This enables certain optimizations in
20842084+ * the kernel.
20852085+ */
20862086+ memset(¶ms, 0, sizeof(params));
20872087+ params.flags |= IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_CLAMP;
20882088+ params.flags |= IORING_SETUP_CQSIZE;
20892089+ params.cq_entries = 1024;
20902090+20912091+ /*
20922092+ * If use_huge is set, setup the ring with IORING_SETUP_NO_MMAP. This
20932093+ * means that the application allocates the memory for the ring, and
20942094+ * the kernel maps it. The alternative is having the kernel allocate
20952095+ * the memory, and then liburing will mmap it. But we can't really
20962096+ * support huge pages that way. If this fails, then ensure that the
20972097+ * system has huge pages set aside upfront.
20982098+ */
20992099+ if (use_huge)
21002100+ params.flags |= IORING_SETUP_NO_MMAP;
21012101+21022102+ /*
21032103+ * DEFER_TASKRUN decouples async event reaping and retrying from
21042104+ * regular system calls. If this isn't set, then io_uring uses
21052105+ * normal task_work for this. task_work is always being run on any
21062106+ * exit to userspace. Real applications do more than just call IO
21072107+ * related system calls, and hence we can be running this work way
21082108+ * too often. Using DEFER_TASKRUN defers any task_work running to
21092109+ * when the application enters the kernel anyway to wait on new
21102110+ * events. It's generally the preferred and recommended way to setup
21112111+ * a ring.
21122112+ */
21132113+ if (defer_tw) {
21142114+ params.flags |= IORING_SETUP_DEFER_TASKRUN;
21152115+ sqpoll = 0;
21162116+ }
21172117+21182118+ /*
21192119+ * SQPOLL offloads any request submission and retry operations to a
21202120+ * dedicated thread. This enables an application to do IO without
21212121+ * ever having to enter the kernel itself. The SQPOLL thread will
21222122+ * stay busy as long as there's work to do, and go to sleep if
21232123+ * sq_thread_idle msecs have passed. If it's running, submitting new
21242124+ * IO just needs to make them visible to the SQPOLL thread, it needs
21252125+ * not enter the kernel. For submission, the application will only
21262126+ * enter the kernel if the SQPOLL has been idle long enough that it
21272127+ * has gone to sleep.
21282128+ *
21292129+ * Waiting on events still need to enter the kernel, if none are
21302130+ * available. The application may also use io_uring_peek_cqe() to
21312131+ * check for new events without entering the kernel, as completions
21322132+ * will be continually produced to the CQ ring by the SQPOLL thread
21332133+ * as they occur.
21342134+ */
21352135+ if (sqpoll) {
21362136+ params.flags |= IORING_SETUP_SQPOLL;
21372137+ params.sq_thread_idle = 1000;
21382138+ defer_tw = 0;
21392139+ }
21402140+21412141+ /*
21422142+ * If neither DEFER_TASKRUN or SQPOLL is used, set COOP_TASKRUN. This
21432143+ * avoids heavy signal based notifications, which can force an
21442144+ * application to enter the kernel and process it as soon as they
21452145+ * occur.
21462146+ */
21472147+ if (!sqpoll && !defer_tw)
21482148+ params.flags |= IORING_SETUP_COOP_TASKRUN;
21492149+21502150+ /*
21512151+ * The SQ ring size need not be larger than any batch of requests
21522152+ * that need to be prepared before submit. Normally in a loop we'd
21532153+ * only need a few, if any, particularly if multishot is used.
21542154+ */
21552155+ ret = io_uring_queue_init_params(ring_size, ring, ¶ms);
21562156+ if (ret) {
21572157+ fprintf(stderr, "%s\n", strerror(-ret));
21582158+ return 1;
21592159+ }
21602160+21612161+ /*
21622162+ * If send serialization is available and no option was given to use
21632163+ * it or not, default it to on. If it was turned on and the kernel
21642164+ * doesn't support it, turn it off.
21652165+ */
21662166+ if (params.features & IORING_FEAT_SEND_BUF_SELECT) {
21672167+ if (send_ring == -1)
21682168+ send_ring = 1;
21692169+ } else {
21702170+ if (send_ring == 1) {
21712171+ fprintf(stderr, "Kernel doesn't support ring provided "
21722172+ "buffers for sends, disabled\n");
21732173+ }
21742174+ send_ring = 0;
21752175+ }
21762176+21772177+ if (!send_ring && snd_bundle) {
21782178+ fprintf(stderr, "Can't use send bundle without send_ring\n");
21792179+ snd_bundle = 0;
21802180+ }
21812181+21822182+ if (fixed_files) {
21832183+ /*
21842184+ * If fixed files are used, we need to allocate a fixed file
21852185+ * table upfront where new direct descriptors can be managed.
21862186+ */
21872187+ ret = io_uring_register_files_sparse(ring, nr_files);
21882188+ if (ret) {
21892189+ fprintf(stderr, "file register: %d\n", ret);
21902190+ return 1;
21912191+ }
21922192+21932193+ /*
21942194+ * If fixed files are used, we also register the ring fd. See
21952195+ * comment near io_uring_prep_socket_direct_alloc() further
21962196+ * down. This avoids the fget/fput overhead associated with
21972197+ * the io_uring_enter(2) system call itself, which is used to
21982198+ * submit and wait on events.
21992199+ */
22002200+ ret = io_uring_register_ring_fd(ring);
22012201+ if (ret != 1) {
22022202+ fprintf(stderr, "ring register: %d\n", ret);
22032203+ return 1;
22042204+ }
22052205+ }
22062206+22072207+ if (napi) {
22082208+ struct io_uring_napi n = {
22092209+ .prefer_busy_poll = napi > 1 ? 1 : 0,
22102210+ .busy_poll_to = napi_timeout,
22112211+ };
22122212+22132213+ ret = io_uring_register_napi(ring, &n);
22142214+ if (ret) {
22152215+ fprintf(stderr, "io_uring_register_napi: %d\n", ret);
22162216+ if (ret != -EINVAL)
22172217+ return 1;
22182218+ fprintf(stderr, "NAPI not available, turned off\n");
22192219+ }
22202220+ }
22212221+22222222+ return 0;
22232223+}
22242224+22252225+static void *thread_main(void *data)
22262226+{
22272227+ struct conn *c = data;
22282228+ int ret;
22292229+22302230+ c->flags |= CONN_F_STARTED;
22312231+22322232+ /* we need a max of 4 descriptors for each client */
22332233+ ret = init_ring(&c->ring, 4);
22342234+ if (ret)
22352235+ goto done;
22362236+22372237+ if (setup_buffer_rings(&c->ring, c))
22382238+ goto done;
22392239+22402240+ /*
22412241+ * If we're using fixed files, then we need to wait for the parent
22422242+ * to install the c->in_fd into our direct descriptor table. When
22432243+ * that happens, we'll set things up. If we're not using fixed files,
22442244+ * we can set up the receive or connect now.
22452245+ */
22462246+ if (!fixed_files)
22472247+ open_socket(c);
22482248+22492249+ /* we're ready */
22502250+ pthread_barrier_wait(&c->startup_barrier);
22512251+22522252+ __event_loop(&c->ring, c);
22532253+done:
22542254+ return NULL;
22552255+}
22562256+22572257+static void usage(const char *name)
22582258+{
22592259+ printf("%s:\n", name);
22602260+ printf("\t-m:\t\tUse multishot receive (%d)\n", recv_mshot);
22612261+ printf("\t-d:\t\tUse DEFER_TASKRUN (%d)\n", defer_tw);
22622262+ printf("\t-S:\t\tUse SQPOLL (%d)\n", sqpoll);
22632263+ printf("\t-f:\t\tUse only fixed files (%d)\n", fixed_files);
22642264+ printf("\t-a:\t\tUse huge pages for the ring (%d)\n", use_huge);
22652265+ printf("\t-t:\t\tTimeout for waiting on CQEs (usec) (%d)\n", wait_usec);
22662266+ printf("\t-w:\t\tNumber of CQEs to wait for each loop (%d)\n", wait_batch);
22672267+ printf("\t-B:\t\tUse bi-directional mode (%d)\n", bidi);
22682268+ printf("\t-s:\t\tAct only as a sink (%d)\n", is_sink);
22692269+ printf("\t-q:\t\tRing size to use (%d)\n", ring_size);
22702270+ printf("\t-H:\t\tHost to connect to (%s)\n", host);
22712271+ printf("\t-r:\t\tPort to receive on (%d)\n", receive_port);
22722272+ printf("\t-p:\t\tPort to connect to (%d)\n", send_port);
22732273+ printf("\t-6:\t\tUse IPv6 (%d)\n", ipv6);
22742274+ printf("\t-N:\t\tUse NAPI polling (%d)\n", napi);
22752275+ printf("\t-T:\t\tNAPI timeout (usec) (%d)\n", napi_timeout);
22762276+ printf("\t-b:\t\tSend/receive buf size (%d)\n", buf_size);
22772277+ printf("\t-n:\t\tNumber of provided buffers (pow2) (%d)\n", nr_bufs);
22782278+ printf("\t-u:\t\tUse provided buffers for send (%d)\n", send_ring);
22792279+ printf("\t-C:\t\tUse bundles for send (%d)\n", snd_bundle);
22802280+ printf("\t-z:\t\tUse zerocopy send (%d)\n", snd_zc);
22812281+ printf("\t-c:\t\tUse bundles for recv (%d)\n", snd_bundle);
22822282+ printf("\t-M:\t\tUse sendmsg (%d)\n", snd_msg);
22832283+ printf("\t-M:\t\tUse recvmsg (%d)\n", rcv_msg);
22842284+ printf("\t-x:\t\tShow extended stats (%d)\n", ext_stat);
22852285+ printf("\t-V:\t\tIncrease verbosity (%d)\n", verbose);
22862286+}
22872287+22882288+/*
22892289+ * Options parsing the ring / net setup
22902290+ */
22912291+int main(int argc, char *argv[])
22922292+{
22932293+ struct io_uring ring;
22942294+ struct sigaction sa = { };
22952295+ const char *optstring;
22962296+ int opt, ret, fd;
22972297+22982298+ setlocale(LC_NUMERIC, "en_US");
22992299+23002300+ page_size = sysconf(_SC_PAGESIZE);
23012301+ if (page_size < 0) {
23022302+ perror("sysconf(_SC_PAGESIZE)");
23032303+ return 1;
23042304+ }
23052305+23062306+ pthread_mutex_init(&thread_lock, NULL);
23072307+23082308+ optstring = "m:d:S:s:b:f:H:r:p:n:B:N:T:w:t:M:R:u:c:C:q:a:x:z:6Vh?";
23092309+ while ((opt = getopt(argc, argv, optstring)) != -1) {
23102310+ switch (opt) {
23112311+ case 'm':
23122312+ recv_mshot = !!atoi(optarg);
23132313+ break;
23142314+ case 'S':
23152315+ sqpoll = !!atoi(optarg);
23162316+ break;
23172317+ case 'd':
23182318+ defer_tw = !!atoi(optarg);
23192319+ break;
23202320+ case 'b':
23212321+ buf_size = atoi(optarg);
23222322+ break;
23232323+ case 'n':
23242324+ nr_bufs = atoi(optarg);
23252325+ break;
23262326+ case 'u':
23272327+ send_ring = !!atoi(optarg);
23282328+ break;
23292329+ case 'c':
23302330+ rcv_bundle = !!atoi(optarg);
23312331+ break;
23322332+ case 'C':
23332333+ snd_bundle = !!atoi(optarg);
23342334+ break;
23352335+ case 'w':
23362336+ wait_batch = atoi(optarg);
23372337+ break;
23382338+ case 't':
23392339+ wait_usec = atoi(optarg);
23402340+ break;
23412341+ case 's':
23422342+ is_sink = !!atoi(optarg);
23432343+ break;
23442344+ case 'f':
23452345+ fixed_files = !!atoi(optarg);
23462346+ break;
23472347+ case 'H':
23482348+ host = strdup(optarg);
23492349+ break;
23502350+ case 'r':
23512351+ receive_port = atoi(optarg);
23522352+ break;
23532353+ case 'p':
23542354+ send_port = atoi(optarg);
23552355+ break;
23562356+ case 'B':
23572357+ bidi = !!atoi(optarg);
23582358+ break;
23592359+ case 'N':
23602360+ napi = !!atoi(optarg);
23612361+ break;
23622362+ case 'T':
23632363+ napi_timeout = atoi(optarg);
23642364+ break;
23652365+ case '6':
23662366+ ipv6 = true;
23672367+ break;
23682368+ case 'M':
23692369+ snd_msg = !!atoi(optarg);
23702370+ break;
23712371+ case 'z':
23722372+ snd_zc = !!atoi(optarg);
23732373+ break;
23742374+ case 'R':
23752375+ rcv_msg = !!atoi(optarg);
23762376+ break;
23772377+ case 'q':
23782378+ ring_size = atoi(optarg);
23792379+ break;
23802380+ case 'a':
23812381+ use_huge = !!atoi(optarg);
23822382+ break;
23832383+ case 'x':
23842384+ ext_stat = !!atoi(optarg);
23852385+ break;
23862386+ case 'V':
23872387+ verbose++;
23882388+ break;
23892389+ case 'h':
23902390+ default:
23912391+ usage(argv[0]);
23922392+ return 1;
23932393+ }
23942394+ }
23952395+23962396+ if (bidi && is_sink) {
23972397+ fprintf(stderr, "Can't be both bidi proxy and sink\n");
23982398+ return 1;
23992399+ }
24002400+ if (snd_msg && sqpoll) {
24012401+ fprintf(stderr, "SQPOLL with msg variants disabled\n");
24022402+ snd_msg = 0;
24032403+ }
24042404+ if (rcv_msg && rcv_bundle) {
24052405+ fprintf(stderr, "Can't use bundles with recvmsg\n");
24062406+ rcv_msg = 0;
24072407+ }
24082408+ if (snd_msg && snd_bundle) {
24092409+ fprintf(stderr, "Can't use bundles with sendmsg\n");
24102410+ snd_msg = 0;
24112411+ }
24122412+ if (snd_msg && send_ring) {
24132413+ fprintf(stderr, "Can't use send ring sendmsg\n");
24142414+ snd_msg = 0;
24152415+ }
24162416+ if (snd_zc && (send_ring || snd_bundle)) {
24172417+ fprintf(stderr, "Can't use send zc with bundles or ring\n");
24182418+ send_ring = snd_bundle = 0;
24192419+ }
24202420+ /*
24212421+ * For recvmsg w/multishot, we waste some data at the head of the
24222422+ * packet every time. Adjust the buffer size to account for that,
24232423+ * so we're still handing 'buf_size' actual payload of data.
24242424+ */
24252425+ if (rcv_msg && recv_mshot) {
24262426+ fprintf(stderr, "Adjusted buf size for recvmsg w/multishot\n");
24272427+ buf_size += sizeof(struct io_uring_recvmsg_out);
24282428+ }
24292429+24302430+ br_mask = nr_bufs - 1;
24312431+24322432+ fd = setup_listening_socket(receive_port, ipv6);
24332433+ if (is_sink)
24342434+ send_port = -1;
24352435+24362436+ if (fd == -1)
24372437+ return 1;
24382438+24392439+ atexit(show_stats);
24402440+ sa.sa_handler = sig_int;
24412441+ sa.sa_flags = SA_RESTART;
24422442+ sigaction(SIGINT, &sa, NULL);
24432443+24442444+ ret = init_ring(&ring, MAX_CONNS * 3);
24452445+ if (ret)
24462446+ return ret;
24472447+24482448+ printf("Backend: sqpoll=%d, defer_tw=%d, fixed_files=%d, "
24492449+ "is_sink=%d, buf_size=%d, nr_bufs=%d, host=%s, send_port=%d, "
24502450+ "receive_port=%d, napi=%d, napi_timeout=%d, huge_page=%d\n",
24512451+ sqpoll, defer_tw, fixed_files, is_sink,
24522452+ buf_size, nr_bufs, host, send_port, receive_port,
24532453+ napi, napi_timeout, use_huge);
24542454+ printf(" recv options: recvmsg=%d, recv_mshot=%d, recv_bundle=%d\n",
24552455+ rcv_msg, recv_mshot, rcv_bundle);
24562456+ printf(" send options: sendmsg=%d, send_ring=%d, send_bundle=%d, "
24572457+ "send_zerocopy=%d\n", snd_msg, send_ring, snd_bundle,
24582458+ snd_zc);
24592459+24602460+ return parent_loop(&ring, fd);
24612461+}
+102
vendor/liburing/examples/proxy.h
···11+/* SPDX-License-Identifier: MIT */
22+#ifndef LIBURING_PROXY_H
33+#define LIBURING_PROXY_H
44+55+#include <sys/time.h>

/*
 * Opcode-agnostic layout of the 64-bit sqe/cqe->user_data value.
 *
 * The three 16-bit fields overlay 'val', so the packed value can be built
 * field by field and then stored or loaded as a single 64-bit integer.
 */
struct userdata {
	union {
		struct {
			/* 4 bits op, 12 bits tid */
			uint16_t op_tid;
			uint16_t bid;
			uint16_t fd;
		};
		uint64_t val;
	};
};

/* 'op' lives above the 12 tid bits inside op_tid */
#define OP_SHIFT	(12)
#define TID_MASK	((1U << 12) - 1)
2323+2424+/*
2525+ * Packs the information that we will need at completion time into the
2626+ * sqe->user_data field, which is passed back in the completion in
2727+ * cqe->user_data. Some apps would need more space than this, and in fact
2828+ * I'd love to pack the requested IO size in here, and it's not uncommon to
2929+ * see apps use this field as just a cookie to either index a data structure
3030+ * at completion time, or even just put the pointer to the associated
3131+ * structure into this field.
3232+ */
3333+static inline void __encode_userdata(struct io_uring_sqe *sqe, int tid, int op,
3434+ int bid, int fd)
3535+{
3636+ struct userdata ud = {
3737+ .op_tid = (op << OP_SHIFT) | tid,
3838+ .bid = bid,
3939+ .fd = fd
4040+ };
4141+4242+ io_uring_sqe_set_data64(sqe, ud.val);
4343+}
4444+4545+static inline uint64_t __raw_encode(int tid, int op, int bid, int fd)
4646+{
4747+ struct userdata ud = {
4848+ .op_tid = (op << OP_SHIFT) | tid,
4949+ .bid = bid,
5050+ .fd = fd
5151+ };
5252+5353+ return ud.val;
5454+}
5555+5656+static inline int cqe_to_op(struct io_uring_cqe *cqe)
5757+{
5858+ struct userdata ud = { .val = cqe->user_data };
5959+6060+ return ud.op_tid >> OP_SHIFT;
6161+}
6262+6363+static inline int cqe_to_bid(struct io_uring_cqe *cqe)
6464+{
6565+ struct userdata ud = { .val = cqe->user_data };
6666+6767+ return ud.bid;
6868+}
6969+7070+static inline int cqe_to_fd(struct io_uring_cqe *cqe)
7171+{
7272+ struct userdata ud = { .val = cqe->user_data };
7373+7474+ return ud.fd;
7575+}

/*
 * Return the number of whole milliseconds elapsed between two timestamps.
 *
 * @s: start time
 * @e: end time
 *
 * Returns 0 if @e is not later than @s. gettimeofday() is not monotonic, so
 * the end can precede the start; without the check the negative difference
 * would be returned as a huge unsigned value.
 */
static inline unsigned long long mtime_since(const struct timeval *s,
					     const struct timeval *e)
{
	long long sec, usec;

	sec = e->tv_sec - s->tv_sec;
	usec = e->tv_usec - s->tv_usec;
	/* borrow one second when the microsecond part went negative */
	if (sec > 0 && usec < 0) {
		sec--;
		usec += 1000000;
	}

	/* end is before start: report no elapsed time */
	if (sec < 0 || (sec == 0 && usec < 0))
		return 0;

	return sec * 1000 + usec / 1000;
}
9393+9494+static unsigned long long mtime_since_now(struct timeval *tv)
9595+{
9696+ struct timeval end;
9797+9898+ gettimeofday(&end, NULL);
9999+ return mtime_since(tv, &end);
100100+}
101101+102102+#endif
+73-13
vendor/liburing/examples/send-zerocopy.c
···3939#include <sys/wait.h>
4040#include <sys/mman.h>
4141#include <linux/mman.h>
4242+#include <signal.h>
42434344#include "liburing.h"
4445···5253 int idx;
5354 unsigned long long packets;
5455 unsigned long long bytes;
5656+ unsigned long long dt_ms;
5557 struct sockaddr_storage dst_addr;
5658 int fd;
5759};
···7274static int cfg_payload_len;
7375static int cfg_port = 8000;
7476static int cfg_runtime_ms = 4200;
7777+static bool cfg_rx_poll = false;
75787679static socklen_t cfg_alen;
7780static char *str_addr = NULL;
···8184static struct thread_data threads[MAX_THREADS];
8285static pthread_barrier_t barrier;

/*
 * Stop request flag, set by the SIGINT handler and checked by the send loop.
 * It is written from signal context, so it has to be volatile sig_atomic_t
 * for the store to be well defined and not cached by the compiler.
 */
static volatile sig_atomic_t should_stop;

/*
 * SIGINT handler: the first signal asks the threads to stop, a second one
 * terminates the process immediately.
 */
static void sigint_handler(__attribute__((__unused__)) int sig)
{
	/* kill if should_stop can't unblock threads fast enough */
	if (should_stop)
		_exit(-1);
	should_stop = 1;
}
9696+8497/*
8598 * Implementation of error(3), prints an error message and exits.
8699 */
···119132 if (cfg_cpu == -1)
120133 return;
121134135135+ CPU_ZERO(&mask);
136136+ CPU_SET(cfg_cpu, &mask);
122137 ret = io_uring_register_iowq_aff(ring, 1, &mask);
123138 if (ret)
124139 t_error(1, ret, "unabled to set io-wq affinity\n");
···315330 const int notif_slack = 128;
316331 struct io_uring ring;
317332 struct iovec iov;
318318- uint64_t tstop;
333333+ uint64_t tstart;
319334 int i, fd, ret;
320335 int compl_cqes = 0;
321336 int ring_flags = IORING_SETUP_COOP_TASKRUN | IORING_SETUP_SINGLE_ISSUER;
337337+ unsigned loop = 0;
322338323339 if (cfg_defer_taskrun)
324340 ring_flags |= IORING_SETUP_DEFER_TASKRUN;
···355371 if (ret)
356372 t_error(1, ret, "io_uring: buffer registration");
357373374374+ if (cfg_rx_poll) {
375375+ struct io_uring_sqe *sqe;
376376+377377+ sqe = io_uring_get_sqe(&ring);
378378+ io_uring_prep_poll_add(sqe, fd, POLLIN);
379379+380380+ ret = io_uring_submit(&ring);
381381+ if (ret != 1)
382382+ t_error(1, ret, "submit poll");
383383+ }
384384+358385 pthread_barrier_wait(&barrier);
359386360360- tstop = gettimeofday_ms() + cfg_runtime_ms;
387387+ tstart = gettimeofday_ms();
361388 do {
362389 struct io_uring_sqe *sqe;
363390 struct io_uring_cqe *cqe;
···419446 }
420447 io_uring_cqe_seen(&ring, cqe);
421448 }
422422- } while (gettimeofday_ms() < tstop);
449449+ if (should_stop)
450450+ break;
451451+ } while ((++loop % 16 != 0) || gettimeofday_ms() < tstart + cfg_runtime_ms);
452452+453453+ td->dt_ms = gettimeofday_ms() - tstart;
423454424455out_fail:
425456 shutdown(fd, SHUT_RDWR);
···435466 io_uring_queue_exit(&ring);
436467}
437468438438-439469static void *do_test(void *arg)
440470{
441471 struct thread_data *td = arg;

/*
 * Print the command line synopsis and the list of supported options.
 *
 * Only prints; the caller decides whether to exit afterwards.
 *
 * @filepath: program name, normally argv[0]
 */
static void usage(const char *filepath)
{
	printf("Usage:\t%s <protocol> <ip-version> -D<addr> [options]\n", filepath);
	printf("\t%s <protocol> <ip-version> -R [options]\n\n", filepath);

	printf(" -4\t\tUse IPv4\n");
	printf(" -6\t\tUse IPv6\n");
	printf(" -D <address>\tDestination address\n");
	printf(" -p <port>\tServer port to listen on/connect to\n");
	printf(" -s <size>\tBytes per request\n");
	printf(" -n <nr>\tNumber of parallel requests\n");
	printf(" -z <mode>\tZerocopy mode, 0 to disable, enabled otherwise\n");
	printf(" -b <mode>\tUse registered buffers\n");
	printf(" -l <mode>\tUse huge pages\n");
	printf(" -d\t\tUse defer taskrun\n");
	printf(" -C <cpu>\tPin to the specified CPU\n");
	printf(" -T <nr>\tNumber of threads to use for sending\n");
	printf(" -R\t\tPlay the server role\n");
	/* -y sets cfg_rx_poll, which queues a POLLIN poll on the tx socket */
	printf(" -y\t\tArm a POLLIN poll request on the sending socket\n");
	printf(" -t <seconds>\tTime in seconds\n");
}
456502457503static void parse_opts(int argc, char **argv)
···463509 int c;
464510 char *daddr = NULL;
465511466466- if (argc <= 1)
512512+ if (argc <= 1) {
467513 usage(argv[0]);
514514+ exit(0);
515515+ }
468516469517 cfg_payload_len = max_payload_len;
470518471471- while ((c = getopt(argc, argv, "46D:p:s:t:n:z:b:l:dC:T:R")) != -1) {
519519+ while ((c = getopt(argc, argv, "46D:p:s:t:n:z:b:l:dC:T:Ry")) != -1) {
472520 switch (c) {
473521 case '4':
474522 if (cfg_family != PF_UNSPEC)
···520568 case 'R':
521569 cfg_rx = 1;
522570 break;
571571+ case 'y':
572572+ cfg_rx_poll = 1;
573573+ break;
523574 }
524575 }
525576···536587537588int main(int argc, char **argv)
538589{
590590+ unsigned long long tsum = 0;
539591 unsigned long long packets = 0, bytes = 0;
540592 struct thread_data *td;
541593 const char *cfg_test;
···577629 if (cfg_rx)
578630 do_setup_rx(cfg_family, cfg_type, 0);
579631632632+ if (!cfg_rx)
633633+ signal(SIGINT, sigint_handler);
634634+580635 for (i = 0; i < cfg_nr_threads; i++)
581636 pthread_create(&threads[i].thread, NULL,
582637 !cfg_rx ? do_test : do_rx, &threads[i]);
···586641 pthread_join(td->thread, &res);
587642 packets += td->packets;
588643 bytes += td->bytes;
644644+ tsum += td->dt_ms;
589645 }
646646+ tsum = tsum / cfg_nr_threads;
590647591591- fprintf(stderr, "packets=%llu (MB=%llu), rps=%llu (MB/s=%llu)\n",
592592- packets, bytes >> 20,
593593- packets / (cfg_runtime_ms / 1000),
594594- (bytes >> 20) / (cfg_runtime_ms / 1000));
595595-648648+ if (!tsum) {
649649+ printf("The run is too short, can't gather stats\n");
650650+ } else {
651651+ printf("packets=%llu (MB=%llu), rps=%llu (MB/s=%llu)\n",
652652+ packets, bytes >> 20,
653653+ packets * 1000 / tsum,
654654+ (bytes >> 20) * 1000 / tsum);
655655+ }
596656 pthread_barrier_destroy(&barrier);
597657 return 0;
598658}
···46464747.SH RETURN VALUE
4848None
4949+.SH NOTES
5050+liburing (or the kernel, for that matter) doesn't care about what buffer ID maps
5151+to what buffer, and in fact when recycling buffers after use, the application is
5252+free to add a different buffer into the same buffer ID location. All that
5353+matters is that the application knows what a given buffer ID in time corresponds
5454+to in terms of virtual memory. There's no liburing or kernel assumption that
5555+these mappings are persistent over time, they can very well be different every
5656+time a given buffer ID is added to the provided buffer ring.
4957.SH SEE ALSO
5058.BR io_uring_register_buf_ring (3),
5159.BR io_uring_buf_ring_mask (3),
+46
vendor/liburing/man/io_uring_buf_ring_available.3
···11+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_buf_ring_available 3 "Jan 11, 2024" "liburing-2.6" "liburing Manual"
66+.SH NAME
77+io_uring_buf_ring_available \- return number of unconsumed provided ring buffer entries
88+.SH SYNOPSIS
99+.nf
1010+.B #include <liburing.h>
1111+.PP
1212+.BI "int io_uring_buf_ring_available(struct io_uring *" ring ",
1313+.BI " struct io_uring_buf_ring *" br ",
1414+.BI " unsigned short " bgid ");"
1515+.fi
1616+.SH DESCRIPTION
1717+.PP
1818+The
1919+.BR io_uring_buf_ring_available (3)
2020+helper returns the number of unconsumed (by the kernel) entries in the
2121+.IR br
2222+provided buffer group belonging to the io_uring
2323+.IR ring
2424+and identified by the buffer group ID
2525+.IR bgid.
2626+2727+Since the head of the provided buffer ring is only visible to the kernel, it's
2828+impossible to otherwise know how many unconsumed entries exist in the given
2929+provided buffer ring. This function queries the kernel to return that number.
3030+3131+.SH NOTES
3232+The returned number of entries reflects the number of unconsumed entries at the
3333+time that it was queried. If inflight IO exists that may consume provided
3434+buffers from this buffer group, then the returned value is inherently racy.
3535+.SH RETURN VALUE
3636+Returns the number of unconsumed entries on success, which may be 0. In case
3737+of error, may return
3838+.BR -ENOENT
3939+if the specified buffer group doesn't exist, or
4040+.BR -EINVAL
4141+if the buffer group isn't of the correct type, or if the kernel doesn't
4242+support this feature.
4343+.SH SEE ALSO
4444+.BR io_uring_register_buf_ring (3),
4545+.BR io_uring_buf_ring_add (3),
4646+.BR io_uring_buf_ring_cq_advance (3)
+6
vendor/liburing/man/io_uring_buf_ring_init.3
···2323.SH RETURN VALUE
2424None
25252626+.SH NOTES
2727+Unless manual setup is needed, it's recommended to use
2828+.BR io_uring_setup_buf_ring (3)
2929+as it provides a simpler way to setup a provided buffer ring.
3030+.
2631.SH SEE ALSO
2732.BR io_uring_register_buf_ring (3),
3333+.BR io_uring_setup_buf_ring (3),
2834.BR io_uring_buf_ring_add (3)
2935.BR io_uring_buf_ring_advance (3),
3036.BR io_uring_buf_ring_cq_advance (3)
+2-2
vendor/liburing/man/io_uring_check_version.3
···2323The
2424.BR io_uring_check_version (3)
2525function returns
2626-.I true
2626+.I false
2727if the liburing library loaded by the dynamic linker is greater-than
2828or equal-to the
2929.I major
···3535The
3636.BR IO_URING_CHECK_VERSION (3)
3737macro returns
3838-.I 1
3838+.I 0
3939if the liburing library being compiled against is greater-than or equal-to the
4040.I major
4141and
+7-1
vendor/liburing/man/io_uring_cq_has_overflow.3
···1818function informs the application if CQ entries have overflowed and are waiting to be flushed to
1919the CQ ring. For example using
2020.BR io_uring_get_events (3)
2121-.
2121+.SH NOTES
2222+Using this function is only valid if the ring has
2323+.B IORING_FEAT_NODROP
2424+set, as it's checking for a flag set by kernels supporting that feature. For
2525+really old kernels that don't support this feature, if CQE overflow is
2626+experienced the CQEs are lost. If that happens, the CQ ring overflow offset
2727+will get incremented.
2228.SH RETURN VALUE
2329True if there are CQ entries waiting to be flushed to the CQ ring.
2430.SH SEE ALSO
+40
vendor/liburing/man/io_uring_enable_rings.3
···11+.\" Copyright (C) 2023 nick black <dankamongmen@gmail.com>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_enable_rings 3 "July 26, 2024" "liburing-2.7" "liburing Manual"
66+.SH NAME
77+io_uring_enable_rings \- enable a disabled ring
88+.SH SYNOPSIS
99+.nf
1010+.B #include <liburing.h>
1111+.PP
1212+.BI "int io_uring_enable_rings(struct io_uring *" ring ");"
1313+.fi
1414+.SH DESCRIPTION
1515+.PP
1616+The
1717+.BR io_uring_enable_rings (3)
1818+function enables a ring after having created it with the
1919+.B IORING_SETUP_R_DISABLED
2020+flag to
2121+.BR io_uring_queue_init (3)
2222+2323+It is not possible to submit work to such a ring until this
2424+function has been successfully called.
2525+2626+.SH RETURN VALUE
2727+.BR io_uring_enable_rings (3)
2828+returns 0 on success. It otherwise returns a negative error code.
2929+It does not write to
3030+.BR errno .
3131+3232+.SH ERRORS
3333+.TP
3434+.B EBADFD
3535+The ring was not disabled.
3636+3737+.SH SEE ALSO
3838+.BR io_uring_queue_init (3),
3939+.BR io_uring_register (2),
4040+.BR io_uring_setup (2)
+49-38
vendor/liburing/man/io_uring_enter.2
···302302request has been terminated and no further events will be generated. This mode
303303is available since 5.13.
304304305305+This command works like
306306+an async
307307+.BR poll(2)
308308+and the completion event result is the returned mask of events.
309309+.TP
310310+.B IORING_OP_POLL_REMOVE
311311+Remove or update an existing poll request. If found, the
312312+.I res
313313+field of the
314314+.I "struct io_uring_cqe"
315315+will contain 0. If not found,
316316+.I res
317317+will contain
318318+.B -ENOENT,
319319+or
320320+.B -EALREADY
321321+if the poll request was in the process of completing already.
322322+305323If
306324.B IORING_POLL_UPDATE_EVENTS
307325is set in the SQE
···311329.I user_data
312330field of the original SQE submitted, and this value is passed in the
313331.I addr
314314-field of the SQE. This mode is available since 5.13.
315315-332332+field of the SQE.
316333If
317334.B IORING_POLL_UPDATE_USER_DATA
318335is set in the SQE
···321338.I user_data
322339of an existing poll request based on the value passed in the
323340.I off
324324-field. This mode is available since 5.13.
325325-326326-This command works like
327327-an async
328328-.BR poll(2)
329329-and the completion event result is the returned mask of events. For the
330330-variants that update
331331-.I user_data
332332-or
333333-.I events
334334-, the completion result will be similar to
335335-.B IORING_OP_POLL_REMOVE.
336336-337337-.TP
338338-.B IORING_OP_POLL_REMOVE
339339-Remove an existing poll request. If found, the
340340-.I res
341341-field of the
342342-.I "struct io_uring_cqe"
343343-will contain 0. If not found,
344344-.I res
345345-will contain
346346-.B -ENOENT,
347347-or
348348-.B -EALREADY
349349-if the poll request was in the process of completing already.
341341+field. Updating an existing poll is available since 5.13.
350342351343.TP
352344.B IORING_OP_EPOLL_CTL
···357349for details of the system call.
358350.I fd
359351holds the file descriptor that represents the epoll instance,
360360-.I addr
352352+.I off
361353holds the file descriptor to add, remove or modify,
362354.I len
363355holds the operation (EPOLL_CTL_ADD, EPOLL_CTL_DEL, EPOLL_CTL_MOD) to perform and,
364364-.I off
356356+.I addr
365357holds a pointer to the
366366-.I epoll_events
358358+.I epoll_event
367359structure. Available since 5.6.
368360369361.TP
···494486.B IORING_OP_TIMEOUT
495487This command will register a timeout operation. The
496488.I addr
497497-field must contain a pointer to a struct timespec64 structure,
489489+field must contain a pointer to a struct __kernel_timespec structure,
498490.I len
499499-must contain 1 to signify one timespec64 structure,
491491+must contain 1 to signify one __kernel_timespec structure,
500492.I timeout_flags
501493may contain IORING_TIMEOUT_ABS
502494for an absolute timeout value, or 0 for a relative timeout.
···567559.I addr
568560and return values are same as before.
569561.I addr2
570570-field must contain a pointer to a struct timespec64 structure.
562562+field must contain a pointer to a struct __kernel_timespec structure.
571563.I timeout_flags
572564may also contain IORING_TIMEOUT_ABS, in which case the value given is an
573565absolute one, not a relative one.
···12421234.in
12431235.PP
1244123612371237+.TP
12381238+.B IORING_OP_WAITID
12391239+Issue the equivalent of a
12401240+.BR waitid(2)
12411241+system call.
12421242+.I len
12431243+must contain the idtype being queried/waited for and
12441244+.I fd
12451245+must contain the 'pid' (or id) being waited for.
12461246+.I file_index
12471247+is the 'options' being set (the child state changes to wait for).
12481248+.I addr2
12491249+is a pointer to siginfo_t, if any, being filled in. See also
12501250+.BR waitid(2)
12511251+for the general description of the related system call. Available since 6.5.
12521252+12451253.PP
12461254The
12471255.I flags
···12741282chain. This flag has no effect on previous SQE submissions, nor does it impact
12751283SQEs that are outside of the chain tail. This means that multiple chains can be
12761284executing in parallel, or chains and individual SQEs. Only members inside the
12771277-chain are serialized. A chain of SQEs will be broken, if any request in that
12851285+chain are serialized. A chain of SQEs will be broken if any request in that
12781286chain ends in error. io_uring considers any unexpected result an error. This
12791287means that, eg, a short read will also terminate the remainder of the chain.
12801288If a chain of SQE links is broken, the remaining unstarted part of the chain
···1330133813311339The semantics are chosen to accommodate several use cases. First, when all but
13321340the last request of a normal link without linked timeouts are marked with the
13331333-flag, only one CQE per lin is posted. Additionally, it enables suppression of
13411341+flag, only one CQE per link is posted. Additionally, it enables suppression of
13341342CQEs in cases where the side effects of a successfully executed operation is
13351343enough for userspace to know the state of the system. One such example would
13361344be writing to a synchronisation file.
···15171525.B IORING_FEAT_NODROP
15181526feature, and there are no otherwise available CQEs. This clears the error state
15191527and so with no other changes the next call to
15201520-.BR io_uring_setup (2)
15281528+.BR io_uring_enter (2)
15211529will not have this error. This error should be extremely rare and indicates the
15221522-machine is running critically low on memory and. It may be reasonable for the
15301530+machine is running critically low on memory. It may be reasonable for the
15231531application to terminate running unless it is able to safely handle any CQE
15241532being lost.
15251533.TP
···15401548occur if the application tries to queue more requests than we have room for in
15411549the CQ ring, or if the application attempts to wait for more events without
15421550having reaped the ones already present in the CQ ring.
15511551+.TP
15521552+.B EEXIST
15531553+The thread submitting the work is invalid.
15431554.TP
15441555.B EINVAL
15451556Some bits in the
+1-1
vendor/liburing/man/io_uring_free_buf_ring.3
···46464747.SH RETURN VALUE
4848On success
4949-.BR io_uring_register_free_ring (3)
4949+.BR io_uring_free_buf_ring (3)
5050returns a pointer to the buffe ring. On failure it returns
5151.BR -errno .
5252.SH SEE ALSO
+54
vendor/liburing/man/io_uring_prep_bind.3
···11+.\" Copyright (C) 2024 SUSE LLC
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_prep_bind 3 "Jun 3, 2024" "liburing-2.7" "liburing Manual"
66+.SH NAME
77+io_uring_prep_bind \- prepare a bind request
88+.SH SYNOPSIS
99+.nf
1010+.B #include <sys/socket.h>
1111+.B #include <liburing.h>
1212+.PP
1313+.BI "void io_uring_prep_bind(struct io_uring_sqe *" sqe ","
1414+.BI " int " sockfd ","
1515+.BI " struct sockaddr *" addr ","
1616+.BI " socklen_t " addrlen ");"
1717+.fi
1818+.SH DESCRIPTION
1919+The
2020+.BR io_uring_prep_bind (3)
2121+function prepares a bind request. The submission queue entry
2222+.I sqe
2323+is setup to assign the network address at
2424+.IR addr ,
2525+of length
2626+.IR addrlen ,
2727+to the socket descriptor
2828+.IR sockfd.
2929+3030+This function prepares an async
3131+.BR bind (2)
3232+request. See that man page for details.
3333+3434+.SH RETURN VALUE
3535+None
3636+.SH ERRORS
3737+The CQE
3838+.I res
3939+field will contain the result of the operation. See the related man page for
4040+details on possible values. Note that where synchronous system calls will return
4141+.B -1
4242+on failure and set
4343+.I errno
4444+to the actual error value, io_uring never uses
4545+.IR errno .
4646+Instead it returns the negated
4747+.I errno
4848+directly in the CQE
4949+.I res
5050+field.
5151+.SH SEE ALSO
5252+.BR io_uring_get_sqe (3),
5353+.BR io_uring_submit (3),
5454+.BR bind (2)
+6
vendor/liburing/man/io_uring_prep_cancel.3
···7474.BR io_uring_prep_cancel_fd (3)
7575sets up. Available since 5.19.
7676.TP
7777+.B IORING_ASYNC_CANCEL_FD_FIXED
7878+Set in conjunction with
7979+.B IORING_ASYNC_CANCEL_FD ,
8080+indicating that the file descriptor given is a direct descriptor rather than
8181+a normal file descriptor. Available since 6.0.
8282+.TP
7783.B IORING_ASYNC_CANCEL_ANY
7884Match any request in the ring, regardless of user_data or file descriptor.
7985Can be used to cancel any pending request in the ring. Available since 5.19.
···11+.\" Copyright (C) 2023 Breno Leitao <leitao@debian.org>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_prep_cmd 3 "July 27, 2023" "liburing-2.5" "liburing Manual"
66+.SH NAME
77+io_uring_prep_cmd_sock \- prepare a command request for a socket
88+.SH SYNOPSIS
99+.nf
1010+.B #include <liburing.h>
1111+.PP
1212+.BI "void io_uring_prep_cmd_sock(struct io_uring_sqe *" sqe ","
1313+.BI " int " cmd_op ","
1414+.BI " int " fd ","
1515+.BI " int " level ","
1616+.BI " int " optname ","
1717+.BI " void " *optval ","
1818+.BI " int " optlen ");"
1919+.fi
2020+.SH DESCRIPTION
2121+.PP
2222+The
2323+.BR io_uring_prep_cmd_sock (3)
2424+function prepares a cmd request for a socket. The submission queue entry
2525+.I sqe
2626+is setup to use the socket file descriptor pointed to by
2727+.I fd
2828+to start a command operation defined by
2929+.I cmd_op.
3030+3131+This is a generic function, and each command has their own individual
3232+.I level, optname, optval
3333+values. The optlen defines the size pointed by
3434+.I optval.
3535+3636+.SH Available commands
3737+3838+.TP
3939+.B SOCKET_URING_OP_SIOCINQ
4040+Returns the amount of queued unread data in the receive buffer.
4141+The socket must not be in LISTEN state, otherwise an error
4242+.B -EINVAL
4343+is returned in the CQE
4444+.I res
4545+field.
4646+The following arguments are not used for this command
4747+.I level, optname, optval
4848+and
4949+.I optlen.
5050+5151+Negative return value means an error.
5252+5353+For more information about this command, please check
5454+.BR unix(7).
5555+5656+5757+.TP
5858+.B SOCKET_URING_OP_SIOCOUTQ
5959+Returns the amount of unsent data in the socket send queue.
6060+The socket must not be in LISTEN state, otherwise an error
6161+.B -EINVAL
6262+is returned in the CQE
6363+.I res
6464+field.
6565+The following arguments are not used for this command
6666+.I level, optname, optval
6767+and
6868+.I optlen.
6969+7070+Negative return value means an error.
7171+7272+For more information about this command, please check
7373+.BR unix(7).
7474+7575+.TP
7676+.B SOCKET_URING_OP_GETSOCKOPT
7777+Command to get options for the socket referred to by the socket file descriptor
7878+.I fd.
7979+The arguments are similar to the
8080+.BR getsockopt(2)
8181+system call.
8282+8383+The
8484+.BR SOCKET_URING_OP_GETSOCKOPT
8585+command is limited to
8686+.BR SOL_SOCKET
8787+.I level.
8888+8989+Differently from the
9090+.BR getsockopt(2)
9191+system call, the updated
9292+.I optlen
9393+value is returned in the CQE
9494+.I res
9595+field, on success. On failure, the CQE
9696+.I res
9797+contains a negative error number.
9898+9999+.TP
100100+.B SOCKET_URING_OP_SETSOCKOPT
101101+Command to set options for the socket referred to by the socket file descriptor
102102+.I fd.
103103+The arguments are similar to the
104104+.BR setsockopt(2)
105105+system call.
106106+107107+.SH NOTES
108108+The memory block pointed by
109109+.I optval
110110+needs to be valid/live until the CQE returns.
111111+112112+.SH RETURN VALUE
113113+Dependent on the command.
114114+115115+.SH ERRORS
116116+The CQE
117117+.I res
118118+field will contain the result of the operation.
119119+.SH SEE ALSO
120120+.BR io_uring_get_sqe (3),
121121+.BR io_uring_submit (3),
122122+.BR io_uring_register (2),
123123+.BR unix (7)
+18-1
vendor/liburing/man/io_uring_prep_fadvise.3
···1313.BI "void io_uring_prep_fadvise(struct io_uring_sqe *" sqe ","
1414.BI " int " fd ","
1515.BI " __u64 " offset ","
1616-.BI " off_t " len ","
1616+.BI " __u32 " len ","
1717.BI " int " advice ");"
1818+.BI "
1919+.BI "void io_uring_prep_fadvise64(struct io_uring_sqe *" sqe ","
2020+.BI " int " fd ","
2121+.BI " __u64 " offset ","
2222+.BI " __u64 " len ","
2323+.BI " int " advice ");"
1824.fi
1925.SH DESCRIPTION
2026.PP
···3036.I len
3137length in bytes, giving it the advise located in
3238.IR advice .
3939+4040+The
4141+.BR io_uring_prep_fadvise64 (3)
4242+function works like
4343+.BR io_uring_prep_fadvise (3)
4444+except that it takes a 64-bit length rather than just a 32-bit one. Older
4646+kernels may not support the 64-bit length variant. If this variant is
4646+used on a kernel that doesn't support 64-bit lengths, then the request will get
4747+errored with
4848+.B -EINVAL
4949+in the results field of the CQE.
33503451This function prepares an async
3552.BR posix_fadvise (2)
···11+.\" Copyright (C) 2023 Jens Axboe <axboe@kernel.dk>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_prep_fixed_fd_install 3 "December 8, 2023" "liburing-2.6" "liburing Manual"
66+.SH NAME
77+io_uring_prep_fixed_fd_install \- prepare fixed file fd installation request
88+.SH SYNOPSIS
99+.nf
1010+.B #include <liburing.h>
1111+.PP
1212+.BI "void io_uring_prep_fixed_fd_install(struct io_uring_sqe *" sqe ","
1313+.BI " int " fd ","
1414+.BI " unsigned int " flags ");"
1515+.fi
1616+.SH DESCRIPTION
1717+.PP
1818+The
1919+.BR io_uring_prep_fixed_fd_install (3)
2020+helper prepares a fixed file descriptor installation. The submission queue entry
2121+.I sqe
2222+is setup to install the direct/fixed file descriptor
2323+.I fd
2424+with the specified
2525+.I flags
2626+file installation flags.
2727+2828+One use case of direct/fixed file descriptors is to turn a regular file
2929+descriptor into a direct one, reducing the overhead of any request that
3030+needs to access this file. This helper provides a way to go the other way,
3131+turning a direct descriptor into a regular file descriptor that can then
3232+subsequently be used by regular system calls that take a normal file descriptor.
3333+This can be handy if no regular file descriptor exists for this direct
3434+descriptor. Either because it was instantiated directly as a fixed descriptor,
3535+or because the regular file was closed with
3636+.BR close (2)
3737+after being turned into a direct descriptor.
3838+3939+Upon successful return of this request, both a normal and fixed file descriptor
4040+exist for the same file. Either one of them may be used to access the file.
4141+Either one of them may be closed without affecting the other one.
4242+4343+.I flags
4444+may be either zero, or set to
4545+.B IORING_FIXED_FD_NO_CLOEXEC
4646+to indicate that the new regular file descriptor should not be closed during
4747+exec. By default,
4848+.B O_CLOEXEC
4949+will be set on the new descriptor otherwise. Setting this field to anything but
5050+those two values will result in the request being failed with
5151+.B -EINVAL
5252+in the CQE
5353+.I res
5454+field.
5555+5656+.SH RETURN VALUE
5757+None
5858+.SH ERRORS
5959+The CQE
6060+.I res
6161+field will contain the result of the operation, which in this case will be the
6262+value of the new regular file descriptor. In case of failure, a negative value
6363+is returned.
6464+.SH SEE ALSO
6565+.BR io_uring_get_sqe (3),
6666+.BR io_uring_submit (3),
6767+.BR io_uring_register_files (3),
6868+.BR io_uring_unregister_files (3),
6969+.BR io_uring_prep_close_direct (3),
7070+.BR io_uring_prep_openat_direct (3)
+48
vendor/liburing/man/io_uring_prep_ftruncate.3
···11+.\" Copyright (C) 2024 Tony Solomonik <tony.solomonik@gmail.com>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_prep_ftruncate 3 "January 23, 2024" "liburing-2.6" "liburing Manual"
66+.SH NAME
77+io_uring_prep_ftruncate \- prepare an ftruncate request
88+.SH SYNOPSIS
99+.nf
1010+.B #include <liburing.h>
1111+.PP
1212+.BI "void io_uring_prep_ftruncate(struct io_uring_sqe *" sqe ","
1313+.BI " int " fd ","
1414+.BI " loff_t " len ");"
1515+.fi
1616+.SH DESCRIPTION
1717+.PP
1818+The
1919+.BR io_uring_prep_ftruncate (3)
2020+function prepares an ftruncate request. The submission queue entry
2121+.I sqe
2222+is setup to use the file descriptor
2323+.I fd
2424+that should get truncated to the length indicated by the
2525+.I len
2626+argument.
2727+2828+.SH RETURN VALUE
2929+None
3030+.SH ERRORS
3131+The CQE
3232+.I res
3333+field will contain the result of the operation. See the related man page for
3434+details on possible values. Note that where synchronous system calls will return
3535+.B -1
3636+on failure and set
3737+.I errno
3838+to the actual error value, io_uring never uses
3939+.IR errno .
4040+Instead it returns the negated
4141+.I errno
4242+directly in the CQE
4343+.I res
4444+field.
4545+.SH SEE ALSO
4646+.BR io_uring_get_sqe (3),
4747+.BR io_uring_submit (3),
4848+.BR ftruncate (2),
+92
vendor/liburing/man/io_uring_prep_futex_wait.3
···11+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_prep_futex_wait 3 "Sep 29, 2023" "liburing-2.5" "liburing Manual"
66+.SH NAME
77+io_uring_prep_futex_wait \- prepare a futex wait request
88+.SH SYNOPSIS
99+.nf
1010+.B #include <linux/futex.h>
1111+.B #include <unistd.h>
1212+.B #include <liburing.h>
1313+.PP
1414+.BI "void io_uring_prep_futex_wait(struct io_uring_sqe *" sqe ","
1515+.BI " uint32_t *" futex ","
1616+.BI " uint64_t " val ","
1717+.BI " uint64_t " mask ","
1818+.BI " uint32_t " futex_flags ","
1919+.BI " unsigned int " flags ");"
2020+.fi
2121+.SH DESCRIPTION
2222+.PP
2323+The
2424+.BR io_uring_prep_futex_wait (3)
2525+function prepares a futex wait request. The submission queue entry
2626+.I sqe
2727+is setup for waiting on a futex at address
2828+.I futex
2929+and which still has the value
3030+.I val
3131+and with
3232+.BR futex2 (2)
3333+flags of
3434+.I futex_flags
3535+and io_uring futex flags of
3636+.IR flags .
3737+3838+.I mask
3939+can be set to a specific bitset mask, which will be matched by the waking
4040+side to decide who to wake up. To always get woken, an application may use
4141+.BR FUTEX_BITSET_MATCH_ANY .
4242+4343+.I futex_flags
4444+follows the
4545+.BR futex2 (2)
4646+flags, not the
4747+.BR futex (2)
4848+v1 interface flags.
4949+5050+.I flags
5151+are currently unused and hence
5252+.B 0
5353+must be passed.
5454+5555+This function prepares an async
5656+.BR futex (2)
5757+wait request. See that man page for details. Note that the io_uring futex
5858+wait request is similar to the
5959+.B FUTEX_WAIT_BITSET
6060+operation, as
6161+.B FUTEX_WAIT
6262+is a strict subset of that.
6363+6464+.SH RETURN VALUE
6565+None
6666+.SH ERRORS
6767+The CQE
6868+.I res
6969+field will contain the result of the operation. See the related man page for
7070+details on possible values. Note that where synchronous system calls will return
7171+.B -1
7272+on failure and set
7373+.I errno
7474+to the actual error value, io_uring never uses
7575+.IR errno .
7676+Instead it returns the negated
7777+.I errno
7878+directly in the CQE
7979+.I res
8080+field.
8181+.SH NOTES
8282+Unlike the sync futex syscalls that wait on a futex, io_uring does not support
8383+passing in a timeout for the request. Instead, applications are encouraged
8484+to use a linked timeout to abort the futex request at a given time, if desired.
8585+.SH SEE ALSO
8686+.BR io_uring_get_sqe (3),
8787+.BR io_uring_submit (3),
8888+.BR io_uring_prep_futex_waitv (3),
8989+.BR io_uring_prep_futex_wake (3),
9090+.BR io_uring_prep_link_timeout (3),
9191+.BR futex (2),
9292+.BR futex2 (2)
+76
vendor/liburing/man/io_uring_prep_futex_waitv.3
···11+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_prep_futex_waitv 3 "Sep 29, 2023" "liburing-2.5" "liburing Manual"
66+.SH NAME
77+io_uring_prep_futex_waitv \- prepare a futex waitv request
88+.SH SYNOPSIS
99+.nf
1010+.B #include <linux/futex.h>
1111+.B #include <unistd.h>
1212+.B #include <liburing.h>
1313+.PP
1414+.BI "void io_uring_prep_futex_waitv(struct io_uring_sqe *" sqe ","
1515+.BI " struct futex_waitv *" futexv ","
1616+.BI " uint32_t " nr_futex ","
1717+.BI " unsigned int " flags ");"
1818+.fi
1919+.SH DESCRIPTION
2020+.PP
2121+The
2222+.BR io_uring_prep_futex_waitv (3)
2323+function prepares a futex wait request for multiple futexes at the same time.
2424+The submission queue entry
2525+.I sqe
2626+is setup for waiting on all futexes given by
2727+.I futexv
2828+and
2929+.I nr_futex
3030+is the number of futexes in that array.
3131+.I flags
3232+must be set to the io_uring specific futex flags.
3333+3434+Unlike
3535+.BR io_uring_prep_futex_wait (3),
3636+the desired bitset mask and values are passed in
3737+.IR futexv .
3838+3939+.I flags
4040+are currently unused and hence
4141+.B 0
4242+must be passed.
4343+4444+This function prepares an async
4545+.BR futex (2)
4646+waitv request. See that man page for details.
4747+4848+.SH RETURN VALUE
4949+None
5050+.SH ERRORS
5151+The CQE
5252+.I res
5353+field will contain the result of the operation. See the related man page for
5454+details on possible values. Note that where synchronous system calls will return
5555+.B -1
5656+on failure and set
5757+.I errno
5858+to the actual error value, io_uring never uses
5959+.IR errno .
6060+Instead it returns the negated
6161+.I errno
6262+directly in the CQE
6363+.I res
6464+field.
6565+.SH NOTES
6666+Unlike the sync futex syscalls that wait on a futex, io_uring does not support
6767+passing in a timeout for the request. Instead, applications are encouraged
6868+to use a linked timeout to abort the futex request at a given time, if desired.
6969+.SH SEE ALSO
7070+.BR io_uring_get_sqe (3),
7171+.BR io_uring_submit (3),
7272+.BR io_uring_prep_futex_wait (3),
7373+.BR io_uring_prep_futex_wake (3),
7474+.BR io_uring_prep_link_timeout (3),
7575+.BR futex (2),
7676+.BR futex2 (2)
+84
vendor/liburing/man/io_uring_prep_futex_wake.3
···11+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_prep_futex_wake 3 "Sep 29, 2023" "liburing-2.5" "liburing Manual"
66+.SH NAME
77+io_uring_prep_futex_wake \- prepare a futex wake request
88+.SH SYNOPSIS
99+.nf
1010+.B #include <linux/futex.h>
1111+.B #include <unistd.h>
1212+.B #include <liburing.h>
1313+.PP
1414+.BI "void io_uring_prep_futex_wake(struct io_uring_sqe *" sqe ","
1515+.BI " uint32_t *" futex ","
1616+.BI " uint64_t " val ","
1717+.BI " uint64_t " mask ","
1818+.BI " uint32_t " futex_flags ","
1919+.BI " unsigned int " flags ");"
2020+.fi
2121+.SH DESCRIPTION
2222+.PP
2323+The
2424+.BR io_uring_prep_futex_wake (3)
2525+function prepares a futex wake request. The submission queue entry
2626+.I sqe
2727+is setup for waking any waiters on the futex indicated by
2828+.I futex
2929+and at most
3030+.I val
3131+futexes.
3232+.I futex_flags
3333+indicates the
3434+.BR futex2 (2)
3535+modifier flags, and io_uring futex flags of
3636+.IR flags .
3737+3838+If a given bitset for who to wake is desired, then that must be set in
3939+.IR mask .
4040+Use
4141+.B FUTEX_BITSET_MATCH_ANY
4242+to match any waiter on the given futex.
4343+4444+.I flags
4545+are currently unused and hence
4646+.B 0
4747+must be passed.
4848+4949+This function prepares an async
5050+.BR futex (2)
5151+wake request. See that man page for details. Note that the io_uring futex
5252+wake request is similar to the
5353+.B FUTEX_WAKE_BITSET
5454+operation, as
5555+.B FUTEX_WAKE
5656+is a strict subset of that.
5757+5858+.SH RETURN VALUE
5959+None
6060+.SH ERRORS
6161+The CQE
6262+.I res
6262+field will contain the result of the operation. On success, the value will be
6363+the number of waiters on
6464+.I futex
6565+that were woken up. See the related man page for details on possible
6767+values for errors. Note that where synchronous system calls will return
6868+.B -1
6969+on failure and set
7070+.I errno
7171+to the actual error value, io_uring never uses
7272+.IR errno .
7373+Instead it returns the negated
7474+.I errno
7575+directly in the CQE
7676+.I res
7777+field.
7878+.SH SEE ALSO
7979+.BR io_uring_get_sqe (3),
8080+.BR io_uring_submit (3),
8181+.BR io_uring_prep_futex_wait (3),
8282+.BR io_uring_prep_futex_waitv (3),
8383+.BR futex (2),
8484+.BR futex2 (2)
+8-8
vendor/liburing/man/io_uring_prep_link_timeout.3
···1515.fi
1616.SH DESCRIPTION
1717.PP
1818-The
1818+The
1919.BR io_uring_prep_link_timeout (3)
2020-function prepares a timeout request for linked sqes. The submission queue entry
2020+function prepares a timeout request for linked sqes. The submission queue entry
2121.I sqe
2222is setup a timeout specified by
2323.IR ts .
···5858Consider an expired timeout a success in terms of the posted completion.
5959.PP
60606161-It is invalid to create a chain (linked sqes) consisting only of a link timeout
6262-request. If all the requests in the chain are completed before timeout, then the
6363-link timeout request gets cancelled. Upon timeout, all the uncompleted requests
6464-in the chain get cancelled.
6161+It is invalid to create a chain (linked sqes) consisting only of a link timeout
6262+request. If all the requests in the chain are completed before timeout, then the
6363+link timeout request gets canceled. Upon timeout, all the uncompleted requests
6464+in the chain get canceled.
65656666.SH RETURN VALUE
6767None
···7878The specified timeout occurred and triggered the completion event.
7979.TP
8080.B -EINVAL
8181-One of the fields set in the SQE was invalid. For example, two clock sources
8181+One of the fields set in the SQE was invalid. For example, two clock sources
8282were given, or the specified timeout seconds or nanoseconds were < 0.
8383.TP
8484.B -EFAULT
8585io_uring was unable to access the data specified by ts.
8686.TP
8787.B -ECANCELED
8888-The timeout was canceled because all submitted requests were completed successfully
8888+The timeout was canceled because all submitted requests were completed successfully
8989or one of the requests resulted in failure.
90909191
+52
vendor/liburing/man/io_uring_prep_listen.3
···11+.\" Copyright (C) 2024 SUSE LLC.
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_prep_listen 3 "Jun 3, 2024" "liburing-2.7" "liburing Manual"
66+.SH NAME
77+io_uring_prep_listen \- prepare a listen request
88+.SH SYNOPSIS
99+.nf
1010+.B #include <sys/socket.h>
1111+.B #include <liburing.h>
1212+.PP
1313+.BI "void io_uring_prep_listen(struct io_uring_sqe *" sqe ","
1414+.BI " int " sockfd ","
1515+.BI " int " backlog ");"
1616+.fi
1717+.SH DESCRIPTION
1818+The
1919+.BR io_uring_prep_listen (3)
2020+function prepares a listen request. The submission queue entry
2121+.I sqe
2222+is setup to place the socket file descriptor pointed to by
2323+.IR sockfd
2424+into a state to accept incoming connections. The parameter
2525+.I backlog
2626+defines the maximum length of the queue of pending connections.
2727+2828+This function prepares an async
2929+.BR listen (2)
3030+request. See that man page for details.
3131+3232+.SH RETURN VALUE
3333+None
3434+.SH ERRORS
3535+The CQE
3636+.I res
3737+field will contain the result of the operation. See the related man page for
3838+details on possible values. Note that where synchronous system calls will return
3939+.B -1
4040+on failure and set
4141+.I errno
4242+to the actual error value, io_uring never uses
4343+.IR errno .
4444+Instead it returns the negated
4545+.I errno
4646+directly in the CQE
4747+.I res
4848+field.
4949+.SH SEE ALSO
5050+.BR io_uring_get_sqe (3),
5151+.BR io_uring_submit (3),
5252+.BR listen (2)
+17-1
vendor/liburing/man/io_uring_prep_madvise.3
···1212.PP
1313.BI "void io_uring_prep_madvise(struct io_uring_sqe *" sqe ","
1414.BI " void *" addr ","
1515-.BI " off_t " len ","
1515+.BI " __u32 " len ","
1616.BI " int " advice ");"
1717+.PP
1818+.BI "void io_uring_prep_madvise64(struct io_uring_sqe *" sqe ","
1919+.BI " void *" addr ","
2020+.BI " __u64 " len ","
2121+.BI " int " advice ");"
1722.fi
1823.SH DESCRIPTION
1924.PP
···2732.I len
2833length in bytes, giving it the advise located in
2934.IR advice .
3535+3636+The
3737+.BR io_uring_prep_madvise64 (3)
3838+function works like
3939+.BR io_uring_prep_madvise (3)
4040+except that it takes a 64-bit length rather than just a 32-bit one. Older
4141+kernels may not support the 64-bit length variant. If this variant is
4242+used on a kernel that doesn't support 64-bit lengths, then the request will get
4343+errored with
4444+.B -EINVAL
4545+in the results field of the CQE.
30463147This function prepares an async
3248.BR madvise (2)
···3434has triggered, a completion CQE is posted and no more events will be generated
3535by the poll request.
3636.BR io_uring_prep_poll_multishot (3)
3737-behaves identically in terms of events, but it persist across notifications
3737+behaves identically in terms of events, but it persists across notifications
3838and will repeatedly post notifications for the same registration. A CQE
3939posted from a multishot poll request will have
4040.B IORING_CQE_F_MORE
+13-1
vendor/liburing/man/io_uring_prep_poll_update.3
···4040If set, the poll update request will replace the existing events being waited
4141for with the ones specified in the
4242.I poll_mask
4343-argument to the function.
4343+argument to the function. Note that only the lower 16 bits of events can
4444+be updated. This includes things like
4545+.B EPOLLIN
4646+and
4747+.BR EPOLLOUT .
4848+Higher order masks/settings are included as internal state, and cannot be
4949+modified. That includes settings like
5050+.BR EPOLLONESHOT ,
5151+.BR EPOLLEXCLUSIVE ,
5252+and
5353+.BR EPOLLET .
5454+If an application wishes to modify these, it must cancel/remove the existing
5555+poll request and arm a new one.
4456.TP
4557.B IORING_POLL_UPDATE_USER_DATA
4658If set, the poll update request will update the existing user_data of the
···11+.\" Copyright (C) 2023 Jens Axboe <axboe@kernel.dk>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_prep_read_multishot 3 "September 12, 2023" "liburing-2.5" "liburing Manual"
66+.SH NAME
77+io_uring_prep_read_multishot \- prepare I/O read multishot request
88+.SH SYNOPSIS
99+.nf
1010+.B #include <liburing.h>
1111+.PP
1212+.BI "void io_uring_prep_read_multishot(struct io_uring_sqe *" sqe ","
1313+.BI " int " fd ","
1414+.BI " unsigned " nbytes ","
1515+.BI " __u64 " offset ","
1616+.BI " int " buf_group ");"
1717+.fi
1818+.SH DESCRIPTION
1919+.PP
2020+The
2121+.BR io_uring_prep_read_multishot (3)
2222+helper prepares an IO read multishot request. The submission queue entry
2323+.I sqe
2424+is setup to use the file descriptor
2525+.I fd
2626+to start reading
2727+into a buffer from the provided buffer group with ID
2828+.I buf_group
2929+at the specified
3030+.IR offset .
3131+3232+.I nbytes
3333+must be set to zero, as the size read will be given by the size of the
3434+buffers in the indicated buffer group IO.
3535+3636+On files that are not capable of seeking, the offset must be 0 or -1.
3737+3838+If
3939+.I nbytes
4040+exceeds the size of the buffers in the specified buffer group, or if
4141+.I nbytes
4242+is
4343+.BR 0 ,
4444+then the size of the buffer in that group will be used for the transfer.
4545+4646+A multishot read request will repeatedly trigger a completion event
4747+whenever data is available to read from the file. Because of that,
4848+this type of request can only be used with a file type that is pollable.
4949+Examples of that include pipes, tun devices, etc. If used with a regular
5050+file, or a wrong file type in general, the request will fail with
5151+.B -EBADFD
5252+in the CQE
5353+.I res
5454+field.
5555+5656+Since multishot requests repeatedly trigger completion events as data
5757+arrives, it must be used with provided buffers. With provided buffers, the
5858+application provides buffers to io_uring upfront, and then the kernel picks
5959+a buffer from the specified group in
6060+.I buf_group
6161+when the request is ready to transfer data.
6262+6363+A multishot request will persist as long as no errors are encountered
6464+doing handling of the request. For each CQE posted on behalf of this request,
6565+the CQE
6666+.I flags
6767+will have
6868+.B IORING_CQE_F_MORE
6969+set if the application should expect more completions from this request.
7070+If this flag isn't set, then that signifies termination of the multishot
7171+read request.
7272+7373+After the read has been prepared it can be submitted with one of the submit
7474+functions.
7575+7676+.SH RETURN VALUE
7777+None
7878+.SH ERRORS
7979+The CQE
8080+.I res
8181+field will contain the result of the operation. See the related man page for
8282+details on possible values. Note that where synchronous system calls will return
8383+.B -1
8484+on failure and set
8585+.I errno
8686+to the actual error value, io_uring never uses
8787+.IR errno .
8888+Instead it returns the negated
8989+.I errno
9090+directly in the CQE
9191+.I res
9292+field.
9393+.SH SEE ALSO
9494+.BR io_uring_get_sqe (3),
9595+.BR io_uring_prep_read (3),
9696+.BR io_uring_buf_ring_init (3),
9797+.BR io_uring_buf_ring_add (3),
9898+.BR io_uring_submit (3)
+28
vendor/liburing/man/io_uring_prep_recv.3
···8080.BR recvmsg (2)
8181operation. If set, the socket still had data to be read after the operation
8282completed. Both these flags are available since 5.19.
8383+8484+.TP
8585+.B IORING_RECVSEND_BUNDLE
8686+If set and provided buffers are used with
8787+.BR IOSQE_BUFFER_SELECT ,
8888+the receive operation will attempt to fill multiple buffers with data rather than
8989+just pick a single buffer to fill. To receive multiple buffers in a single
9090+receive, the buffer group ID set in the SQE must be of the ring provided type.
9191+If set, the CQE
9292+.I res
9393+field indicates the total number of bytes received, and the buffer ID returned
9494+in the CQE
9595+.I flags
9696+field indicates the first buffer in the receive operation. The application must
9797+iterate from the indicated initial buffer ID and until all
9898+.I res
9999+bytes have been seen to know which is the last buffer in the receive operation.
100100+The buffer IDs consumed will be contiguous from the starting ID, in the order
101101+in which they were added to the buffer ring used. Receiving in bundles can
102102+improve performance when more than one chunk of data is available to receive,
103103+by eliminating redundant round trips through the networking stack. Receive
104104+bundles may be used by both single shot and multishot receive operations. Note
105105+that, internally, bundles rely on the networking stack passing back how much
106106+data is left in the socket after the initial receive. This means that the
107107+initial receive may contain fewer buffers than what is available, with the
108108+followup receive(s) containing more buffers. Available since 6.10.
83109.P
8411085111.SH RETURN VALUE
···102128.SH SEE ALSO
103129.BR io_uring_get_sqe (3),
104130.BR io_uring_submit (3),
131131+.BR io_uring_buf_ring_init (3),
132132+.BR io_uring_buf_ring_add (3),
105133.BR recv (2)
+2
vendor/liburing/man/io_uring_prep_recvmsg.3
···121121.SH SEE ALSO
122122.BR io_uring_get_sqe (3),
123123.BR io_uring_submit (3),
124124+.BR io_uring_buf_ring_init (3),
125125+.BR io_uring_buf_ring_add (3),
124126.BR recvmsg (2)
+49
vendor/liburing/man/io_uring_prep_send.3
···2222.BI " int " flags ","
2323.BI " const struct sockaddr *" addr ","
2424.BI " socklen_t " addrlen ");"
2525+.PP
2626+.BI "void io_uring_prep_send_bundle(struct io_uring_sqe *" sqe ","
2727+.BI " int " sockfd ","
2828+.BI " size_t " len ","
2929+.BI " int " flags ");"
2530.fi
2631.SH DESCRIPTION
2732.PP
···7378.BR sendto (2)
7479request. See that man page for details.
75808181+Both of the above send variants may be used with provided buffers, where rather
8282+than pass a buffer in directly with the request,
8383+.B IOSQE_BUFFER_SELECT
8484+is set in the SQE
8585+.I flags
8686+field, and additionally a buffer group ID is set in the SQE
8787+.I buf_group
8888+field. By using provided buffers with send requests, the application can
8989+prevent any kind of reordering of the outgoing data which can otherwise
9090+occur if the application has more than one send request inflight for a single
9191+socket. This provides better pipelining of data, where previously the app
9292+needed to manually serialize sends.
9393+9494+The bundle version allows the application to issue a single send request,
9595+with a buffer group ID given in the SQE
9696+.I buf_group
9797+field, which keeps sending from that buffer group until it runs out of buffers.
9898+As with any other request using provided buffers,
9999+.B IOSQE_BUFFER_SELECT
100100+must also be set in the SQE
101101+.I flags
102102+before submission. Currently
103103+.I len
104104+must be given as
105105+.B 0
106106+otherwise the request will be errored with
107107+.B -EINVAL
108108+as the result code. Future versions may allow setting
109109+.I len
110110+to limit the transfer size. A single CQE is posted for the send, with the result
111111+being how many bytes were sent, on success. When used with provided buffers,
112112+send or send bundle will contain the starting buffer group ID in the CQE
113113+.I flags
114114+field. The number of bytes sent starts from there, and will be in contiguous
115115+buffer IDs after that. Send bundle, and send with provided buffers in general,
116116+are available since kernel 6.10, and can be further identified by checking for
117117+the
118118+.B IORING_FEAT_SEND_BUF_SELECT
119119+flag returned when using
120120+.BR io_uring_queue_init_params (3)
121121+to setup the ring.
122122+76123.SH RETURN VALUE
77124None
78125.SH ERRORS
···93140.SH SEE ALSO
94141.BR io_uring_get_sqe (3),
95142.BR io_uring_submit (3),
143143+.BR io_uring_buf_ring_init (3),
144144+.BR io_uring_buf_ring_add (3),
96145.BR send (2),
97146.BR sendto (2)
···8686.SH SEE ALSO
8787.BR io_uring_get_sqe (3),
8888.BR io_uring_submit (3),
8989+.BR io_uring_buf_ring_init (3),
9090+.BR io_uring_buf_ring_add (3),
8991.BR sendmsg (2)
+1-1
vendor/liburing/man/io_uring_prep_socket.3
···5757The
5858.BR io_uring_prep_socket_direct_alloc (3)
5959helper works just like
6060-.BR io_uring_prep_socket_alloc (3),
6060+.BR io_uring_prep_socket_direct (3),
6161except it allocates a new direct descriptor rather than pass a free slot in. It
6262is equivalent to using
6363.BR io_uring_prep_socket_direct (3)
+7-6
vendor/liburing/man/io_uring_prep_timeout.3
···44.\"
55.TH io_uring_prep_poll_timeout 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
66.SH NAME
77-io_uring_prep_timeoute \- prepare a timeout request
77+io_uring_prep_timeout \- prepare a timeout request
88.SH SYNOPSIS
99.nf
1010.B #include <liburing.h>
···4545The realtime clock source should be used.
4646.TP
4747.B IORING_TIMEOUT_ETIME_SUCCESS
4848-Consider an expired timeout a success in terms of the posted completion.
4949-Normally a timeout that triggers would return in a
4848+Consider an expired timeout a success in terms of the posted completion. This
4949+means it will not sever dependent links, as a failed request normally would. The
5050+posted CQE result code will still contain
5051.B -ETIME
5151-CQE
5252+in the
5253.I res
5354value.
5455.TP
···5758IORING_CQE_F_MORE is set if more timeouts are expected. The value specified in
5859.I count
5960is the number of repeats. A value of 0 means the timeout is indefinite and can
6060-only be stopped by a removal request.
6161+only be stopped by a removal request. Available since the 6.4 kernel.
6162.PP
6263The timeout completion event will trigger if either the specified timeout
6364has occurred, or the specified number of events to wait for have been posted
···7778.TP
7879.B -EINVAL
7980One of the fields set in the SQE was invalid. For example, two clocksources
8080-where given, or the specified timeout seconds or nanoseconds where < 0.
8181+were given, or the specified timeout seconds or nanoseconds were < 0.
8182.TP
8283.B -EFAULT
8384io_uring was unable to access the data specified by
···44.\"
55.TH io_uring_prep_poll_timeout_update 3 "March 12, 2022" "liburing-2.2" "liburing Manual"
66.SH NAME
77-io_uring_prep_timeoute_update \- prepare a request to update an existing timeout
77+io_uring_prep_timeout_update \- prepare a request to update an existing timeout
88.SH SYNOPSIS
99.nf
1010.B #include <liburing.h>
···7878.TP
7979.B -EINVAL
8080One of the fields set in the SQE was invalid. For example, two clocksources
8181-where given, or the specified timeout seconds or nanoseconds where < 0.
8181+were given, or the specified timeout seconds or nanoseconds were < 0.
8282.TP
8383.B -EFAULT
8484io_uring was unable to access the data specified by
+65
vendor/liburing/man/io_uring_prep_waitid.3
···11+.\" Copyright (C) 2022 Jens Axboe <axboe@kernel.dk>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_prep_waitid 3 "July 14, 2023" "liburing-2.5" "liburing Manual"
66+.SH NAME
77+io_uring_prep_waitid \- prepare a waitid request
88+.SH SYNOPSIS
99+.nf
1010+.B #include <sys/wait.h>
1111+.B #include <liburing.h>
1212+.PP
1313+.BI "void io_uring_prep_waitid(struct io_uring_sqe *" sqe ","
1414+.BI " idtype_t " idtype ","
1515+.BI " id_t " id ","
1616+.BI " siginfo_t *" infop ","
1717+.BI " int " options ","
1818+.BI " unsigned int " flags ");"
1919+.fi
2020+.SH DESCRIPTION
2121+.PP
2222+The
2323+.BR io_uring_prep_waitid (3)
2424+function prepares a waitid request. The submission queue entry
2525+.I sqe
2626+is setup to use the
2727+.I idtype
2828+and
2929+.I id
3030+arguments to select the child(ren), and
3131+.I options
3232+to specify the child state changes to wait for. Upon successful
3333+return, it fills
3434+.I infop
3535+with information of the child process, if any.
3636+.I flags
3737+is io_uring specific modifier flags. They are currently unused, and hence
3838+.B 0
3939+should be passed.
4040+4141+This function prepares an async
4242+.BR waitid (2)
4343+request. See that man page for details.
4444+4545+.SH RETURN VALUE
4646+None
4747+.SH ERRORS
4848+The CQE
4949+.I res
5050+field will contain the result of the operation. See the related man page for
5151+details on possible values. Note that where synchronous system calls will return
5252+.B -1
5353+on failure and set
5454+.I errno
5555+to the actual error value, io_uring never uses
5656+.IR errno .
5757+Instead it returns the negated
5858+.I errno
5959+directly in the CQE
6060+.I res
6161+field.
6262+.SH SEE ALSO
6363+.BR io_uring_get_sqe (3),
6464+.BR io_uring_submit (3),
6565+.BR waitid (2)
+48-7
vendor/liburing/man/io_uring_queue_init.3
···1717.BI "int io_uring_queue_init_params(unsigned " entries ","
1818.BI " struct io_uring *" ring ","
1919.BI " struct io_uring_params *" params ");"
2020+.PP
2121+.BI "int io_uring_queue_init_mem(unsigned " entries ","
2222+.BI " struct io_uring *" ring ","
2323+.BI " struct io_uring_params *" params ","
2424+.BI " void *" buf ", size_t " buf_size ");"
2025.fi
2126.SH DESCRIPTION
2227.PP
···3540for the SQ ring. This is adequate for regular file or storage workloads, but
3641may be too small for networked workloads. The SQ ring entries do not impose a
3742limit on the number of in-flight requests that the ring can support, it merely
3838-limits the number that can be submitted to the kernel in one go (batch). if the
4343+limits the number that can be submitted to the kernel in one go (batch). If the
3944CQ ring overflows, e.g. more entries are generated than fits in the ring before
4040-the application can reap them, then the ring enters a CQ ring overflow state.
4141-This is indicated by
4545+the application can reap them, then if the kernel supports
4646+.B IORING_FEAT_NODROP
4747+the ring enters a CQ ring overflow state. Otherwise it drops the CQEs and
4848+increments
4949+.I cq.koverflow
5050+in
5151+.I struct io_uring
5252+with the number of CQEs dropped. The overflow state is indicated by
4253.B IORING_SQ_CQ_OVERFLOW
4354being set in the SQ ring flags. Unless the kernel runs out of available memory,
4455entries are not dropped, but it is a much slower completion path and will slow
···6576will be passed through to the io_uring_setup syscall (see
6677.BR io_uring_setup (2)).
67786868-If the
7979+The
6980.BR io_uring_queue_init_params (3)
7070-variant is used, then the parameters indicated by
8181+and
8282+.BR io_uring_queue_init_mem (3)
8383+variants will pass the parameters indicated by
7184.I params
7272-will be passed straight through to the
8585+straight through to the
7386.BR io_uring_setup (2)
7487system call.
75888989+The
9090+.BR io_uring_queue_init_mem (3)
9191+variant uses the provided
9292+.I buf
9393+with associated size
9494+.I buf_size
9595+as the memory for the ring, using the
9696+.B IORING_SETUP_NO_MMAP
9797+flag to
9898+.BR io_uring_setup (2).
9999+The buffer passed to
100100+.BR io_uring_queue_init_mem (3)
101101+must already be zeroed.
102102+Typically, the caller should allocate a huge page and pass that in to
103103+.BR io_uring_queue_init_mem (3).
104104+Pages allocated by mmap are already zeroed.
105105+.BR io_uring_queue_init_mem (3)
106106+returns the number of bytes used from the provided buffer, so that the app can
107107+reuse the buffer with the returned offset to put more rings in the same huge
108108+page.
109109+76110On success, the resources held by
77111.I ring
78112should be released via a corresponding call to
79113.BR io_uring_queue_exit (3).
80114.SH RETURN VALUE
81115.BR io_uring_queue_init (3)
8282-returns 0 on success and
116116+and
117117+.BR io_uring_queue_init_params (3)
118118+return 0 on success and
119119+.BR -errno
120120+on failure.
121121+122122+.BR io_uring_queue_init_mem (3)
123123+returns the number of bytes used from the provided buffer on success, and
83124.BR -errno
84125on failure.
85126.SH SEE ALSO
···505505pointer to an array of two values, with the values in the array being set to
506506the maximum count of workers per NUMA node. Index 0 holds the bounded worker
507507count, and index 1 holds the unbounded worker count. On successful return, the
508508-passed in array will contain the previous maximum valyes for each type. If the
508508+passed in array will contain the previous maximum values for each type. If the
509509count being passed in is 0, then this command returns the current maximum values
510510and doesn't modify the current setting.
511511.I nr_args
···528528system call.
529529530530.I arg
531531-must be set to an unsigned int pointer to an array of type
532532-.I struct io_uring_rsrc_register
531531+must be set to a pointer to an array of type
532532+.I struct io_uring_rsrc_update
533533of
534534.I nr_args
535535number of entries. The
···570570.B IORING_REGISTER_RING_FDS.
571571572572.I arg
573573-must be set to an unsigned int pointer to an array of type
574574-.I struct io_uring_rsrc_register
573573+must be set to a pointer to an array of type
574574+.I struct io_uring_rsrc_update
575575of
576576.I nr_args
577577number of entries. Only the
···757757.B IORING_REGISTER_RESTRICTIONS
758758was specified, but there were already buffers, files, or restrictions
759759registered.
760760+.TP
761761+.B EEXIST
762762+The thread performing the registration is invalid.
760763.TP
761764.B EFAULT
762765buffer is outside of the process' accessible address space, or
+5
vendor/liburing/man/io_uring_register_buf_ring.3
···133133.BR io_uring_register_buf_ring (3)
134134returns 0. On failure it returns
135135.BR -errno .
136136+.SH NOTES
137137+Unless manual setup is needed, it's recommended to use
138138+.BR io_uring_setup_buf_ring (3)
139139+as it provides a simpler way to set up a provided buffer ring.
136140.SH SEE ALSO
137141.BR io_uring_buf_ring_init (3),
138142.BR io_uring_buf_ring_add (3),
143143+.BR io_uring_setup_buf_ring (3),
139144.BR io_uring_buf_ring_advance (3),
140145.BR io_uring_buf_ring_cq_advance (3)
+8
vendor/liburing/man/io_uring_register_files.3
···6060.I nr_files
6161number of file descriptors. These files must be updated before use, e.g. using
6262.BR io_uring_register_files_update_tag (3).
6363+Note that if the size of the sparse table exceeds what
6464+.B RLIMIT_NOFILE
6565+allows, then
6666+.BR io_uring_register_files_sparse (3)
6767+will attempt to raise the limit using
6868+.BR setrlimit (2)
6969+and retry the operation. If the registration fails after doing that, then an
7070+error will be returned.
6371The sparse variant is available in kernels 5.19 and later.
64726573Registering a file table is a prerequisite for using any request that uses
+40
vendor/liburing/man/io_uring_register_napi.3
···11+.\" Copyright (C) 2022 Stefan Roesch <shr@devkernel.io>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_register_napi 3 "November 16, 2022" "liburing-2.4" "liburing Manual"
66+.SH NAME
77+io_uring_register_napi \- register NAPI busy poll settings
88+.SH SYNOPSIS
99+.nf
1010+.B #include <liburing.h>
1111+.PP
1212+.BI "int io_uring_register_napi(struct io_uring *" ring ","
1313+.BI " struct io_uring_napi *" napi)
1414+.PP
1515+.fi
1616+.SH DESCRIPTION
1717+.PP
1818+The
1919+.BR io_uring_register_napi (3)
2020+function registers the NAPI settings for subsequent operations. The NAPI
2121+settings are specified in the structure that is passed in the
2222+.I napi
2323+parameter. The structure consists of the napi timeout
2424+.I busy_poll_to
2525+(napi busy poll timeout in us) and
2626+.IR prefer_busy_poll .
2828+Registering NAPI settings sets the mode when calling the function
2929+napi_busy_loop and corresponds to the SO_PREFER_BUSY_POLL socket
3030+option.
3131+3232+NAPI busy poll can reduce the network roundtrip time.
3333+3434+3535+.SH RETURN VALUE
3636+On success
3737+.BR io_uring_register_napi (3)
3838+return 0. On failure they return
3939+.BR -errno .
4040+It also updates the napi structure with the current values.
+70-13
vendor/liburing/man/io_uring_setup.2
···180180If this flag is specified, and if
181181.IR entries
182182exceeds
183183-.B IORING_MAX_ENTRIES ,
183183+.BR IORING_MAX_ENTRIES ,
184184then
185185.IR entries
186186will be clamped at
187187-.B IORING_MAX_ENTRIES .
187187+.BR IORING_MAX_ENTRIES .
188188If the flag
189189-.BR IORING_SETUP_SQPOLL
189189+.B IORING_SETUP_CQSIZE
190190is set, and if the value of
191191.IR "struct io_uring_params.cq_entries"
192192exceeds
193193-.B IORING_MAX_CQ_ENTRIES ,
193193+.BR IORING_MAX_CQ_ENTRIES ,
194194then it will be clamped at
195195-.B IORING_MAX_CQ_ENTRIES .
195195+.BR IORING_MAX_CQ_ENTRIES .
196196.TP
197197.B IORING_SETUP_ATTACH_WQ
198198This flag should be set in conjunction with
···210210for details on how to enable the ring. Available since 5.10.
211211.TP
212212.B IORING_SETUP_SUBMIT_ALL
213213-Normally io_uring stops submitting a batch of request, if one of these requests
213213+Normally io_uring stops submitting a batch of requests, if one of these requests
214214results in an error. This can cause submission of less than what is expected,
215215if a request ends in error while being submitted. If the ring is created with
216216this flag,
···300300trigger work (for example via any of the CQE waiting functions) or else completions may
301301not be delivered.
302302Available since 6.1.
303303+.TP
304304+.B IORING_SETUP_NO_MMAP
305305+By default, io_uring allocates kernel memory that callers must subsequently
306306+.BR mmap (2).
307307+If this flag is set, io_uring instead uses caller-allocated buffers;
308308+.I p->cq_off.user_addr
309309+must point to the memory for the sq/cq rings, and
310310+.I p->sq_off.user_addr
311311+must point to the memory for the sqes.
312312+Each allocation must be contiguous memory.
313313+Typically, callers should allocate this memory by using
314314+.BR mmap (2)
315315+to allocate a huge page.
316316+If this flag is set, a subsequent attempt to
317317+.BR mmap (2)
318318+the io_uring file descriptor will fail.
319319+Available since 6.5.
320320+.TP
321321+.B IORING_SETUP_REGISTERED_FD_ONLY
322322+If this flag is set, io_uring will register the ring file descriptor, and
323323+return the registered descriptor index, without ever allocating an unregistered
324324+file descriptor. The caller will need to use
325325+.B IORING_REGISTER_USE_REGISTERED_RING
326326+when calling
327327+.BR io_uring_register (2).
328328+This flag only makes sense when used alongside
329329+.BR IORING_SETUP_NO_MMAP ,
330330+which also needs to be set.
331331+Available since 6.5.
332332+333333+.TP
334334+.B IORING_SETUP_NO_SQARRAY
335335+If this flag is set, entries in the submission queue will be submitted in order,
336336+wrapping around to the first entry after reaching the end of the queue. In other
337337+words, there will be no more indirection via the array of submission entries,
338338+and the queue will be indexed directly by the submission queue tail and the
339339+range of indices represented by it modulo queue size. Subsequently, the user
340340+should not map the array of submission queue entries, and the corresponding
341341+offset in
342342+.I struct io_sqring_offsets
343343+will be set to zero. Available since 6.6.
344344+303345.PP
304346If no flags are specified, the io_uring instance is set up for
305347interrupt driven I/O. I/O may be submitted using
···323365.TP
324366.B IORING_FEAT_NODROP
325367If this flag is set, io_uring supports almost never dropping completion events.
326326-If a completion event occurs and the CQ ring is full, the kernel stores
327327-the event internally until such a time that the CQ ring has room for more
328328-entries. If this overflow condition is entered, attempting to submit more
329329-IO will fail with the
368368+A dropped event can only occur if the kernel runs out of memory, in which case
369369+you have worse problems than a lost event. Your application and others will
370370+likely get OOM killed anyway. If a completion event occurs and the CQ ring is
371371+full, the kernel stores the event internally until such a time that the CQ ring
372372+has room for more entries. In earlier kernels, if this overflow condition is
373373+entered, attempting to submit more IO would fail with the
330374.B -EBUSY
331375error value, if it can't flush the overflown events to the CQ ring. If this
332376happens, the application must reap events from the CQ ring and attempt the
···410454can be used for IO commands without needing registration. Available since
411455kernel 5.11.
412456.TP
413413-.B IORING_FEAT_ENTER_EXT_ARG
457457+.B IORING_FEAT_EXT_ARG
414458If this flag is set, then the
415459.BR io_uring_enter (2)
416460system call supports passing in an extended argument instead of just the
···496540 __u32 flags;
497541 __u32 dropped;
498542 __u32 array;
499499- __u32 resv[3];
543543+ __u32 resv1;
544544+ __u64 user_addr;
500545};
501546.EE
502547.in
···592637 __u32 overflow;
593638 __u32 cqes;
594639 __u32 flags;
595595- __u32 resv[3];
640640+ __u32 resv1;
641641+ __u64 user_addr;
596642};
597643.EE
598644.in
···647693was specified, but
648694.I io_uring_params.cq_entries
649695was invalid.
696696+.B IORING_SETUP_REGISTERED_FD_ONLY
697697+was specified, but
698698+.B IORING_SETUP_NO_MMAP
699699+was not.
650700.TP
651701.B EMFILE
652702The per-process limit on the number of open file descriptors has been
···666716.B IORING_SETUP_SQPOLL
667717was specified, but the effective user ID of the caller did not have sufficient
668718privileges.
719719+.TP
720720+.B EPERM
721721+.I /proc/sys/kernel/io_uring_disabled
722722+has the value 2, or it has the value 1 and the calling process does not hold the
723723+.B CAP_SYS_ADMIN
724724+capability or is not a member of
725725+.IR /proc/sys/kernel/io_uring_group .
669726.SH SEE ALSO
670727.BR io_uring_register (2),
671728.BR io_uring_enter (2)
+8-2
vendor/liburing/man/io_uring_setup_buf_ring.3
···62626363.SH RETURN VALUE
6464On success
6565-.BR io_uring_register_setup_ring (3)
6666-returns a pointer to the buffe ring. On failure it returns
6565+.BR io_uring_setup_buf_ring (3)
6666+returns a pointer to the buffer ring. On failure it returns
6767.BR NULL
6868and sets
6969.I *ret
7070to -errno.
7171+.SH NOTES
7272+Note that even if the kernel supports this feature, registering a provided
7373+buffer ring may still fail with
7474+.B -EINVAL
7575+if the host is a 32-bit architecture and the memory being passed in resides in
7676+high memory.
7177.SH SEE ALSO
7278.BR io_uring_register_buf_ring (3),
7379.BR io_uring_buf_ring_init (3),
+6-1
vendor/liburing/man/io_uring_submit.3
···2626.SH RETURN VALUE
2727On success
2828.BR io_uring_submit (3)
2929-returns the number of submitted submission queue entries. On failure it returns
2929+returns the number of submitted submission queue entries, if SQPOLL is not used.
3030+If SQPOLL is used, the return value may report a higher number of submitted
3131+entries than actually submitted. If the user requires accurate information
3232+about how many submission queue entries have been successfully submitted, while
3333+using SQPOLL, the user must fall back to repeatedly submitting a single submission
3434+queue entry. On failure it returns
3035.BR -errno .
3136.SH NOTES
3237For any request that passes in data in a struct, that data must remain
+6
vendor/liburing/man/io_uring_submit_and_wait.3
···2727and prepares the SQE, it can be submitted with
2828.BR io_uring_submit_and_wait (3) .
29293030+Ideally used with a ring setup with
3131+.BR IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN
3232+as that will greatly reduce the number of context switches that an application
3333+will see waiting on multiple requests.
3434+3035.SH RETURN VALUE
3136On success
3237.BR io_uring_submit_and_wait (3)
3338returns the number of submitted submission queue entries. On failure it returns
3439.BR -errno .
3540.SH SEE ALSO
4141+.BR io_uring_queue_init_params (3),
3642.BR io_uring_get_sqe (3),
3743.BR io_uring_submit (3),
3844.BR io_uring_submit_and_wait_timeout (3)
···2828.I ts
2929expires. The completion events are stored in the
3030.I cqe_ptr
3131-array. The
3131+array.
3232+.PP
3333+The
3234.I sigmask
3333-specifies the set of signals to block. The prevailing signal mask is restored
3434-before returning.
3535+specifies the set of signals to block. If set, it is equivalent to atomically
3636+executing the following calls:
3737+.PP
3838+.in +4n
3939+.EX
4040+sigset_t origmask;
35414242+pthread_sigmask(SIG_SETMASK, &sigmask, &origmask);
4343+ret = io_uring_submit_and_wait_timeout(ring, cqe, wait_nr, ts, NULL);
4444+pthread_sigmask(SIG_SETMASK, &origmask, NULL);
4545+.EE
4646+.in
4747+.PP
3648After the caller retrieves a submission queue entry (SQE) with
3749.BR io_uring_get_sqe (3)
3850and prepares the SQE, it can be submitted with
3951.BR io_uring_submit_and_wait_timeout (3) .
40525353+Ideally used with a ring setup with
5454+.BR IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN
5555+as that will greatly reduce the number of context switches that an application
5656+will see waiting on multiple requests.
5757+4158.SH RETURN VALUE
4259On success
4360.BR io_uring_submit_and_wait_timeout (3)
···5067.B -ETIME
5168is returned in this case.
5269.SH SEE ALSO
7070+.BR io_uring_queue_init_params (3),
5371.BR io_uring_get_sqe (3),
5472.BR io_uring_submit (3),
5573.BR io_uring_submit_and_wait (3),
+27
vendor/liburing/man/io_uring_unregister_napi.3
···11+.\" Copyright (C) 2022 Stefan Roesch <shr@devkernel.io>
22+.\"
33+.\" SPDX-License-Identifier: LGPL-2.0-or-later
44+.\"
55+.TH io_uring_unregister_napi 3 "November 16, 2022" "liburing-2.4" "liburing Manual"
66+.SH NAME
77+io_uring_unregister_napi \- unregister NAPI busy poll settings
88+.SH SYNOPSIS
99+.nf
1010+.B #include <liburing.h>
1111+.PP
1212+.BI "int io_uring_unregister_napi(struct io_uring *" ring ","
1313+.BI " struct io_uring_napi *" napi)
1414+.PP
1515+.fi
1616+.SH DESCRIPTION
1717+.PP
1818+The
1919+.BR io_uring_unregister_napi (3)
2020+function unregisters the NAPI busy poll settings for subsequent operations.
2121+2222+.SH RETURN VALUE
2323+On success
2424+.BR io_uring_unregister_napi (3)
2525+returns 0. On failure it returns
2626+.BR -errno .
2727+It also updates the napi structure with the current values.
+6
vendor/liburing/man/io_uring_wait_cqe_nr.3
···3131the application can retrieve the completion with
3232.BR io_uring_wait_cqe (3).
33333434+Ideally used with a ring setup with
3535+.BR IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN
3636+as that will greatly reduce the number of context switches that an application
3737+will see waiting on multiple requests.
3838+3439.SH RETURN VALUE
3540On success
3641.BR io_uring_wait_cqe_nr (3)
···3944The return value indicates the result of waiting for a CQE, and it has no
4045relation to the CQE result itself.
4146.SH SEE ALSO
4747+.BR io_uring_queue_init_params (3),
4248.BR io_uring_submit (3),
4349.BR io_uring_wait_cqes (3)
+15-3
vendor/liburing/man/io_uring_wait_cqes.3
···2525.I ring
2626param, waiting for them if necessary or until the timeout
2727.I ts
2828-expires. The
2828+expires.
2929+.PP
3030+The
2931.I sigmask
3030-specifies the set of signals to block. The prevailing signal mask is restored
3131-before returning.
3232+specifies the set of signals to block. If set, it is equivalent to atomically
3333+executing the following calls:
3434+.PP
3535+.in +4n
3636+.EX
3737+sigset_t origmask;
32383939+pthread_sigmask(SIG_SETMASK, &sigmask, &origmask);
4040+ret = io_uring_wait_cqes(ring, cqe, wait_nr, ts, NULL);
4141+pthread_sigmask(SIG_SETMASK, &origmask, NULL);
4242+.EE
4343+.in
4444+.PP
3345The
3446.I cqe_ptr
3547param is filled in on success with the first CQE. Callers of this function
···1010#include "arch/x86/lib.h"
1111#elif defined(__aarch64__)
1212#include "arch/aarch64/lib.h"
1313+#elif defined(__riscv) && __riscv_xlen == 64
1414+#include "arch/riscv64/lib.h"
1315#else
1416/*
1517 * We don't have nolibc support for this arch. Must use libc!
···212212 * Ensure kernel sees the SQE updates before the tail update.
213213 */
214214 if (!(ring->flags & IORING_SETUP_SQPOLL))
215215- IO_URING_WRITE_ONCE(*sq->ktail, tail);
215215+ *sq->ktail = tail;
216216 else
217217 io_uring_smp_store_release(sq->ktail, tail);
218218 }
219219 /*
220220- * This _may_ look problematic, as we're not supposed to be reading
221221- * SQ->head without acquire semantics. When we're in SQPOLL mode, the
222222- * kernel submitter could be updating this right now. For non-SQPOLL,
223223- * task itself does it, and there's no potential race. But even for
224224- * SQPOLL, the load is going to be potentially out-of-date the very
225225- * instant it's done, regardless or whether or not it's done
226226- * atomically. Worst case, we're going to be over-estimating what
227227- * we can submit. The point is, we need to be able to deal with this
228228- * situation regardless of any perceived atomicity.
229229- */
230230- return tail - *sq->khead;
220220+ * This load needs to be atomic, since sq->khead is written concurrently
221221+ * by the kernel, but it doesn't need to be load_acquire, since the
222222+ * kernel doesn't store to the submission queue; it advances khead just
223223+ * to indicate that it's finished reading the submission queue entries
224224+ * so they're available for us to write to.
225225+ */
226226+ return tail - IO_URING_READ_ONCE(*sq->khead);
231227}
232228233229/*
···11+/* SPDX-License-Identifier: MIT */
22+/*
33+ * Description: test persistence of mmap'ed provided ring buffers. Use a range
44+ * of buffer group IDs that puts us into both the lower end array
55+ *              and higher end xarray.
66+ *
77+ */
88+#include <stdio.h>
99+#include <stdlib.h>
1010+#include <unistd.h>
1111+#include <fcntl.h>
1212+#include <string.h>
1313+#include <sys/mman.h>
1414+1515+#include "liburing.h"
1616+#include "helpers.h"
1717+1818+#define BGID_START 60
1919+#define BGID_NR 10
2020+#define ENTRIES 512
2121+2222+int main(int argc, char *argv[])
2323+{
2424+ struct io_uring_buf_ring *br[BGID_NR];
2525+ struct io_uring ring;
2626+ size_t ring_size;
2727+ int ret, i, j;
2828+2929+ if (argc > 1)
3030+ return T_EXIT_SKIP;
3131+3232+ ret = io_uring_queue_init(1, &ring, 0);
3333+ if (ret) {
3434+ fprintf(stderr, "queue init failed %d\n", ret);
3535+ return T_EXIT_FAIL;
3636+ }
3737+3838+ ring_size = ENTRIES * sizeof(struct io_uring_buf);
3939+4040+ for (i = 0; i < BGID_NR; i++) {
4141+ int bgid = BGID_START + i;
4242+ struct io_uring_buf_reg reg = {
4343+ .ring_entries = ENTRIES,
4444+ .bgid = bgid,
4545+ .flags = IOU_PBUF_RING_MMAP,
4646+ };
4747+ off_t off;
4848+4949+ ret = io_uring_register_buf_ring(&ring, ®, 0);
5050+ if (ret) {
5151+ if (ret == -EINVAL)
5252+ return T_EXIT_SKIP;
5353+ fprintf(stderr, "reg buf ring: %d\n", ret);
5454+ return T_EXIT_FAIL;
5555+ }
5656+5757+ off = IORING_OFF_PBUF_RING |
5858+ (unsigned long long) bgid << IORING_OFF_PBUF_SHIFT;
5959+ br[i] = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
6060+ MAP_SHARED | MAP_POPULATE, ring.ring_fd, off);
6161+ if (br[i] == MAP_FAILED) {
6262+ perror("mmap");
6363+ return T_EXIT_FAIL;
6464+ }
6565+ }
6666+6767+ for (i = 0; i < BGID_NR; i++) {
6868+ ret = io_uring_unregister_buf_ring(&ring, BGID_START + i);
6969+ if (ret) {
7070+ fprintf(stderr, "reg buf ring: %d\n", ret);
7171+ return T_EXIT_FAIL;
7272+ }
7373+ }
7474+7575+ for (j = 0; j < 1000; j++) {
7676+ for (i = 0; i < BGID_NR; i++)
7777+ memset(br[i], 0x5a, ring_size);
7878+ usleep(1000);
7979+ }
8080+8181+ io_uring_queue_exit(&ring);
8282+ return T_EXIT_PASS;
8383+}
+57-18
vendor/liburing/test/buf-ring.c
···292292 return cqe->flags >> 16;
293293}
294294295295-static int test_running(int bgid, int entries, int loops)
295295+static int test_running(int bgid, int entries, int loops, int use_mmap)
296296{
297297 int ring_mask = io_uring_buf_ring_mask(entries);
298298 struct io_uring_buf_ring *br;
···303303304304 ret = t_create_ring(1, &ring, 0);
305305 if (ret == T_SETUP_SKIP)
306306- return 0;
306306+ return T_EXIT_SKIP;
307307 else if (ret != T_SETUP_OK)
308308- return 1;
308308+ return T_EXIT_FAIL;
309309310310- br = io_uring_setup_buf_ring(&ring, entries, bgid, 0, &ret);
311311- if (!br) {
312312- /* by now should have checked if this is supported or not */
313313- fprintf(stderr, "Buffer ring register failed %d\n", ret);
314314- return 1;
310310+ if (!use_mmap) {
311311+ br = io_uring_setup_buf_ring(&ring, entries, bgid, 0, &ret);
312312+ if (!br) {
313313+ /* by now should have checked if this is supported or not */
314314+ fprintf(stderr, "Buffer ring register failed %d\n", ret);
315315+ return T_EXIT_FAIL;
316316+ }
317317+ } else {
318318+ struct io_uring_buf_reg reg = {
319319+ .ring_entries = entries,
320320+ .bgid = bgid,
321321+ .flags = IOU_PBUF_RING_MMAP,
322322+ };
323323+ size_t ring_size;
324324+ off_t off;
325325+326326+ ret = io_uring_register_buf_ring(&ring, ®, 0);
327327+ if (ret) {
328328+ if (ret == -EINVAL)
329329+ return T_EXIT_SKIP;
330330+ fprintf(stderr, "mmap ring register failed %d\n", ret);
331331+ return T_EXIT_FAIL;
332332+ }
333333+334334+ off = IORING_OFF_PBUF_RING |
335335+ (unsigned long long) bgid << IORING_OFF_PBUF_SHIFT;
336336+ ring_size = sizeof(struct io_uring_buf) * entries;
337337+ br = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
338338+ MAP_SHARED | MAP_POPULATE, ring.ring_fd, off);
339339+ if (br == MAP_FAILED) {
340340+ perror("mmap");
341341+ return T_EXIT_FAIL;
342342+ }
315343 }
316344317345 buffers = malloc(sizeof(bool) * entries);
318346 if (!buffers)
319319- return 1;
347347+ return T_EXIT_SKIP;
320348321349 read_fd = open("/dev/zero", O_RDONLY);
322350 if (read_fd < 0)
323323- return 1;
351351+ return T_EXIT_SKIP;
324352325353 for (loop = 0; loop < loops; loop++) {
326354 memset(buffers, 0, sizeof(bool) * entries);
···333361 ret = test_one_read(read_fd, bgid, &ring);
334362 if (ret < 0) {
335363 fprintf(stderr, "bad run %d/%d = %d\n", loop, idx, ret);
336336- return ret;
364364+ return T_EXIT_FAIL;
337365 }
338366 if (buffers[ret]) {
339367 fprintf(stderr, "reused buffer %d/%d = %d!\n", loop, idx, ret);
340340- return 1;
368368+ return T_EXIT_FAIL;
341369 }
342370 if (buffer[0] != 0) {
343371 fprintf(stderr, "unexpected read %d %d/%d = %d!\n",
344372 (int)buffer[0], loop, idx, ret);
345345- return 1;
373373+ return T_EXIT_FAIL;
346374 }
347375 if (buffer[1] != 1) {
348376 fprintf(stderr, "unexpected spilled read %d %d/%d = %d!\n",
349377 (int)buffer[1], loop, idx, ret);
350350- return 1;
378378+ return T_EXIT_FAIL;
351379 }
352380 buffers[ret] = true;
353381 }
354382 ret = test_one_read(read_fd, bgid, &ring);
355383 if (ret != -ENOBUFS) {
356384 fprintf(stderr, "expected enobufs run %d = %d\n", loop, ret);
357357- return 1;
385385+ return T_EXIT_FAIL;
358386 }
359387360388 }
···362390 ret = io_uring_unregister_buf_ring(&ring, bgid);
363391 if (ret) {
364392 fprintf(stderr, "Buffer ring register failed %d\n", ret);
365365- return 1;
393393+ return T_EXIT_FAIL;
366394 }
367395368396 close(read_fd);
369397 io_uring_queue_exit(&ring);
370398 free(buffers);
371371- return 0;
399399+ return T_EXIT_PASS;
372400}
373401374402int main(int argc, char *argv[])
···423451 }
424452425453 for (i = 0; !no_buf_ring && entries[i] != -1; i++) {
426426- ret = test_running(2, entries[i], 3);
454454+ ret = test_running(2, entries[i], 3, 0);
427455 if (ret) {
428456 fprintf(stderr, "test_running(%d) failed\n", entries[i]);
429457 return T_EXIT_FAIL;
430458 }
431459 }
460460+461461+ for (i = 0; !no_buf_ring && entries[i] != -1; i++) {
462462+ ret = test_running(2, entries[i], 3, 1);
463463+ if (ret == T_EXIT_SKIP) {
464464+ break;
465465+ } else if (ret != T_EXIT_PASS) {
466466+ fprintf(stderr, "test_running(%d) mmap failed\n", entries[i]);
467467+ return T_EXIT_FAIL;
468468+ }
469469+ }
470470+432471433472 return T_EXIT_PASS;
434473}
+4
vendor/liburing/test/config
···11# Copy this to config.local, uncomment and define values
22#
33+# NOTE: any files or devices added here will be used by tests that take
44+# a file or device argument. This includes tests that are destructive with
55+# respect to data contents. They may get erased or overwritten as part of tests.
66+#
37# Define tests to exclude from running
48# TEST_EXCLUDE=""
59#
+46-11
vendor/liburing/test/connect.c
···133133 return ret;
134134}
135135136136-static int connect_socket(struct io_uring *ring, int fd, int *code)
136136+static int connect_socket(struct io_uring *ring, int fd, int *code, int async)
137137{
138138 struct sockaddr_in addr;
139139 int ret, res;
···150150 }
151151152152 io_uring_prep_connect(sqe, fd, (struct sockaddr*)&addr, sizeof(addr));
153153+ if (async)
154154+ sqe->flags |= IOSQE_ASYNC;
153155 sqe->user_data = 1;
154156155157 ret = submit_and_wait(ring, &res);
···186188 if (connect_fd == -1)
187189 return -1;
188190189189- ret = connect_socket(ring, connect_fd, &code);
191191+ ret = connect_socket(ring, connect_fd, &code, 0);
190192 if (ret == -1)
191193 goto err;
192194···209211 return -1;
210212}
211213212212-static int test_connect(struct io_uring *ring)
214214+static int test_connect(struct io_uring *ring, int async)
213215{
214216 int accept_fd;
215217 int connect_fd;
···227229 if (connect_fd == -1)
228230 goto err1;
229231230230- ret = connect_socket(ring, connect_fd, &code);
232232+ ret = connect_socket(ring, connect_fd, &code, async);
231233 if (ret == -1)
232234 goto err2;
233235···296298 }
297299298300 // We first connect with one client socket in order to fill the accept queue.
299299- ret = connect_socket(ring, connect_fd[0], &code);
301301+ ret = connect_socket(ring, connect_fd[0], &code, 0);
300302 if (ret == -1 || code != 0) {
301303 fprintf(stderr, "unable to connect\n");
302304 goto err;
···363365 return -1;
364366}
365367366366-int main(int argc, char *argv[])
368368+static int test(int flags)
367369{
368370 struct io_uring ring;
369371 int ret;
370372371371- if (argc > 1)
372372- return T_EXIT_SKIP;
373373-374374- ret = io_uring_queue_init(8, &ring, 0);
373373+ ret = io_uring_queue_init(8, &ring, flags);
375374 if (ret) {
376375 fprintf(stderr, "io_uring_queue_setup() = %d\n", ret);
377376 return T_EXIT_FAIL;
···390389 if (no_connect)
391390 return T_EXIT_SKIP;
392391393393- ret = test_connect(&ring);
392392+ ret = test_connect(&ring, 0);
393393+ if (ret == -1) {
394394+ fprintf(stderr, "test_connect(): failed\n");
395395+ return T_EXIT_FAIL;
396396+ }
397397+398398+ ret = test_connect(&ring, 1);
394399 if (ret == -1) {
395400 fprintf(stderr, "test_connect(): failed\n");
396401 return T_EXIT_FAIL;
···405410 io_uring_queue_exit(&ring);
406411 return T_EXIT_PASS;
407412}
413413+414414+int main(int argc, char *argv[])
415415+{
416416+ int ret;
417417+418418+ if (argc > 1)
419419+ return T_EXIT_SKIP;
420420+421421+ ret = test(0);
422422+ if (ret == -1) {
423423+ fprintf(stderr, "test 0 failed\n");
424424+ return T_EXIT_FAIL;
425425+ }
426426+ if (no_connect)
427427+ return T_EXIT_SKIP;
428428+429429+ ret = test(IORING_SETUP_SQPOLL);
430430+ if (ret == -1) {
431431+ fprintf(stderr, "test SQPOLL failed\n");
432432+ return T_EXIT_FAIL;
433433+ }
434434+435435+ ret = test(IORING_SETUP_SINGLE_ISSUER|IORING_SETUP_DEFER_TASKRUN);
436436+ if (ret == -1) {
437437+ fprintf(stderr, "test DEFER failed\n");
438438+ return T_EXIT_FAIL;
439439+ }
440440+441441+ return T_EXIT_PASS;
442442+}
+60
vendor/liburing/test/coredump.c
···11+/* SPDX-License-Identifier: MIT */
22+/*
33+ * Description: trigger segfault. A recent 6.4-rc kernel introduced a bug
44+ * via vhost where segfaults for applications using io_uring
55+ * would hang in D state forever upon trying to generate the
66+ * core file. Perform a trivial test where a child process
77+ * generates a NULL pointer dereference and ensure that we don't
88+ * hang.
99+ *
1010+ */
1111+#include <stdio.h>
1212+#include <stdlib.h>
1313+#include <unistd.h>
1414+#include <sys/wait.h>
1515+1616+#include "liburing.h"
1717+#include "helpers.h"
1818+1919+static void test(void)
2020+{
2121+ struct io_uring_sqe *sqe;
2222+ struct io_uring ring;
2323+ int *ptr = NULL;
2424+ int fds[2];
2525+ char r1;
2626+2727+ if (pipe(fds) < 0) {
2828+ perror("pipe");
2929+ exit(0);
3030+ }
3131+3232+ io_uring_queue_init(8, &ring, 0);
3333+3434+ sqe = io_uring_get_sqe(&ring);
3535+ io_uring_prep_read(sqe, fds[0], &r1, sizeof(r1), 0);
3636+ sqe->flags = IOSQE_ASYNC;
3737+ sqe->user_data = 1;
3838+3939+ io_uring_submit(&ring);
4040+ *ptr = 0;
4141+ exit(0);
4242+}
4343+4444+int main(int argc, char *argv[])
4545+{
4646+ pid_t pid;
4747+ int wstat;
4848+4949+ pid = fork();
5050+ if (pid < 0) {
5151+ perror("fork");
5252+ return T_EXIT_SKIP;
5353+ } else if (!pid) {
5454+ test();
5555+ }
5656+5757+ wait(&wstat);
5858+ unlink("core");
5959+ return T_EXIT_PASS;
6060+}
+173
vendor/liburing/test/defer-tw-timeout.c
···11+/* SPDX-License-Identifier: MIT */
22+/*
33+ * Description: test waiting for more events than what will be posted with
44+ * a timeout with DEFER_TASKRUN. All kernels should time out,
55+ * but a non-buggy kernel will end up with one CQE available
66+ * for reaping. Buggy kernels will not have processed the
77+ * task_work and will have 0 events.
88+ *
99+ */
1010+#include <errno.h>
1111+#include <stdio.h>
1212+#include <unistd.h>
1313+#include <stdlib.h>
1414+#include <string.h>
1515+#include <pthread.h>
1616+1717+#include "liburing.h"
1818+#include "helpers.h"
1919+2020+struct d {
2121+ int fd;
2222+};
2323+2424+static void *thread_fn(void *data)
2525+{
2626+ struct d *d = data;
2727+ int ret;
2828+2929+ usleep(100000);
3030+ ret = write(d->fd, "Hello", 5);
3131+ if (ret < 0)
3232+ perror("write");
3333+ return NULL;
3434+}
3535+3636+static int test_poll(struct io_uring *ring)
3737+{
3838+ struct io_uring_cqe *cqe;
3939+ struct io_uring_sqe *sqe;
4040+ struct __kernel_timespec ts;
4141+ int ret, fds[2], i;
4242+ pthread_t thread;
4343+ char buf[32];
4444+ struct d d;
4545+ void *tret;
4646+4747+ if (pipe(fds) < 0) {
4848+ perror("pipe");
4949+ return 1;
5050+ }
5151+ d.fd = fds[1];
5252+5353+ sqe = io_uring_get_sqe(ring);
5454+ io_uring_prep_read(sqe, fds[0], buf, sizeof(buf), 0);
5555+5656+ pthread_create(&thread, NULL, thread_fn, &d);
5757+5858+ ts.tv_sec = 1;
5959+ ts.tv_nsec = 0;
6060+6161+ ret = io_uring_submit_and_wait_timeout(ring, &cqe, 2, &ts, NULL);
6262+ if (ret != 1) {
6363+ fprintf(stderr, "unexpected wait ret %d\n", ret);
6464+ return T_EXIT_FAIL;
6565+ }
6666+6767+ for (i = 0; i < 2; i++) {
6868+ ret = io_uring_peek_cqe(ring, &cqe);
6969+ if (ret)
7070+ break;
7171+ io_uring_cqe_seen(ring, cqe);
7272+ }
7373+7474+ if (i != 1) {
7575+ fprintf(stderr, "Got %d request, expected 1\n", i);
7676+ return T_EXIT_FAIL;
7777+ }
7878+7979+ pthread_join(thread, &tret);
8080+ return T_EXIT_PASS;
8181+}
8282+8383+static int test_file(struct io_uring *ring, char *__fname)
8484+{
8585+ struct io_uring_cqe *cqe;
8686+ struct io_uring_sqe *sqe;
8787+ struct __kernel_timespec ts;
8888+ char filename[64], *fname;
8989+ int fd, ret, i;
9090+ void *buf;
9191+9292+ if (!__fname) {
9393+ fname = filename;
9494+ sprintf(fname, ".defer-tw-timeout.%d", getpid());
9595+ t_create_file(fname, 128*1024);
9696+ } else {
9797+ fname = __fname;
9898+ }
9999+100100+ fd = open(fname, O_RDONLY | O_DIRECT);
101101+ if (fd < 0) {
102102+ if (errno == EINVAL) {
103103+ if (!__fname)
104104+ unlink(fname);
105105+ return T_EXIT_SKIP;
106106+ }
107107+ perror("open");
108108+ if (!__fname)
109109+ unlink(fname);
110110+ return T_EXIT_FAIL;
111111+ }
112112+113113+ if (!__fname)
114114+ unlink(fname);
115115+116116+ if (posix_memalign(&buf, 4096, 4096)) {
117117+ close(fd);
118118+ return T_EXIT_FAIL;
119119+ }
120120+121121+ sqe = io_uring_get_sqe(ring);
122122+ io_uring_prep_read(sqe, fd, buf, 4096, 0);
123123+124124+ ts.tv_sec = 1;
125125+ ts.tv_nsec = 0;
126126+127127+ ret = io_uring_submit_and_wait_timeout(ring, &cqe, 2, &ts, NULL);
128128+ if (ret != 1) {
129129+ fprintf(stderr, "unexpected wait ret %d\n", ret);
130130+ close(fd);
131131+ return T_EXIT_FAIL;
132132+ }
133133+134134+ for (i = 0; i < 2; i++) {
135135+ ret = io_uring_peek_cqe(ring, &cqe);
136136+ if (ret)
137137+ break;
138138+ io_uring_cqe_seen(ring, cqe);
139139+ }
140140+141141+ if (i != 1) {
142142+ fprintf(stderr, "Got %d request, expected 1\n", i);
143143+ close(fd);
144144+ return T_EXIT_FAIL;
145145+ }
146146+147147+ close(fd);
148148+ return T_EXIT_PASS;
149149+}
150150+151151+int main(int argc, char *argv[])
152152+{
153153+ struct io_uring ring;
154154+ char *fname = NULL;
155155+ int ret;
156156+157157+ ret = io_uring_queue_init(8, &ring, IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN);
158158+ if (ret == -EINVAL)
159159+ return T_EXIT_SKIP;
160160+161161+ if (argc > 1)
162162+ fname = argv[1];
163163+164164+ ret = test_file(&ring, fname);
165165+ if (ret != T_EXIT_PASS)
166166+ return ret;
167167+168168+ ret = test_poll(&ring);
169169+ if (ret != T_EXIT_PASS)
170170+ return ret;
171171+172172+ return T_EXIT_PASS;
173173+}
+3-3
vendor/liburing/test/defer.c
···8888 return 0;
8989}
90909191-static int test_cancelled_userdata(struct io_uring *ring)
9191+static int test_canceled_userdata(struct io_uring *ring)
9292{
9393 struct test_context ctx;
9494 int ret, i, nr = 100;
···276276 }
277277278278279279- ret = test_cancelled_userdata(&poll_ring);
279279+ ret = test_canceled_userdata(&poll_ring);
280280 if (ret) {
281281- printf("test_cancelled_userdata failed\n");
281281+ printf("test_canceled_userdata failed\n");
282282 return ret;
283283 }
284284
+1-1
vendor/liburing/test/eventfd-reg.c
···4343 return T_EXIT_FAIL;
4444 }
45454646- /* Check that registrering again will get -EBUSY */
4646+ /* Check that registering again will get -EBUSY */
4747 ret = io_uring_register_eventfd(&ring, evfd[1]);
4848 if (ret != -EBUSY) {
4949 fprintf(stderr, "unexpected 2nd register: %d\n", ret);
+500
vendor/liburing/test/fd-install.c
···11+/* SPDX-License-Identifier: MIT */
22+/*
33+ * Description: test installing a direct descriptor into the regular
44+ * file table
55+ *
66+ */
77+#include <errno.h>
88+#include <stdio.h>
99+#include <unistd.h>
1010+#include <stdlib.h>
1111+#include <string.h>
1212+#include <fcntl.h>
1313+1414+#include "liburing.h"
1515+#include "helpers.h"
1616+1717+static int no_fd_install;
1818+1919+/* test that O_CLOEXEC is accepted, and others are not */
2020+static int test_flags(struct io_uring *ring, int async)
2121+{
2222+ struct io_uring_sqe *sqe;
2323+ struct io_uring_cqe *cqe;
2424+ int ret, fds[2], fd;
2525+2626+ if (pipe(fds) < 0) {
2727+ perror("pipe");
2828+ return T_EXIT_FAIL;
2929+ }
3030+3131+ ret = io_uring_register_files(ring, &fds[0], 1);
3232+ if (ret) {
3333+ fprintf(stderr, "failed register files %d\n", ret);
3434+ return T_EXIT_FAIL;
3535+ }
3636+3737+ /* check that setting an invalid flag fails */
3838+ sqe = io_uring_get_sqe(ring);
3939+ io_uring_prep_fixed_fd_install(sqe, 0, 1U << 17);
4040+ io_uring_submit(ring);
4141+4242+ ret = io_uring_wait_cqe(ring, &cqe);
4343+ if (ret) {
4444+ fprintf(stderr, "wait cqe %d\n", ret);
4545+ return T_EXIT_FAIL;
4646+ }
4747+ if (cqe->res != -EINVAL) {
4848+ fprintf(stderr, "unexpected cqe res %d\n", cqe->res);
4949+ return T_EXIT_FAIL;
5050+ }
5151+ io_uring_cqe_seen(ring, cqe);
5252+5353+ /* check that IORING_FIXED_FD_NO_CLOEXEC is accepted */
5454+ sqe = io_uring_get_sqe(ring);
5555+ io_uring_prep_fixed_fd_install(sqe, 0, IORING_FIXED_FD_NO_CLOEXEC);
5656+ if (async)
5757+ sqe->flags |= IOSQE_ASYNC;
5858+ io_uring_submit(ring);
5959+6060+ ret = io_uring_wait_cqe(ring, &cqe);
6161+ if (ret) {
6262+ fprintf(stderr, "wait cqe %d\n", ret);
6363+ return T_EXIT_FAIL;
6464+ }
6565+ if (cqe->res < 0) {
6666+ fprintf(stderr, "unexpected cqe res %d\n", cqe->res);
6767+ return T_EXIT_FAIL;
6868+ }
6969+ fd = cqe->res;
7070+ io_uring_cqe_seen(ring, cqe);
7171+7272+ close(fds[0]);
7373+ close(fds[1]);
7474+ close(fd);
7575+ io_uring_unregister_files(ring);
7676+7777+ return T_EXIT_PASS;
7878+}
7979+8080+static int test_linked(struct io_uring *ring)
8181+{
8282+ struct io_uring_sqe *sqe;
8383+ struct io_uring_cqe *cqe;
8484+ int ret, fds[2], fd, i;
8585+8686+ if (pipe(fds) < 0) {
8787+ perror("pipe");
8888+ return T_EXIT_FAIL;
8989+ }
9090+9191+ ret = io_uring_register_files(ring, &fds[0], 1);
9292+ if (ret) {
9393+ fprintf(stderr, "failed register files %d\n", ret);
9494+ return T_EXIT_FAIL;
9595+ }
9696+9797+ sqe = io_uring_get_sqe(ring);
9898+ io_uring_prep_nop(sqe);
9999+ sqe->flags |= IOSQE_IO_LINK;
100100+ sqe->user_data = 1;
101101+102102+ sqe = io_uring_get_sqe(ring);
103103+ io_uring_prep_fixed_fd_install(sqe, 0, 0);
104104+ sqe->user_data = 2;
105105+106106+ ret = io_uring_submit(ring);
107107+ if (ret != 2) {
108108+ fprintf(stderr, "submit: %d\n", ret);
109109+ return T_EXIT_FAIL;
110110+ }
111111+112112+ fd = -1;
113113+ for (i = 0; i < 2; i++) {
114114+ ret = io_uring_wait_cqe(ring, &cqe);
115115+ if (ret) {
116116+ fprintf(stderr, "wait cqe %d\n", ret);
117117+ return T_EXIT_FAIL;
118118+ }
119119+ if (cqe->res < 0) {
120120+ fprintf(stderr, "unexpected cqe res %d\n", cqe->res);
121121+ return T_EXIT_FAIL;
122122+ }
123123+ if (cqe->user_data == 2)
124124+ fd = cqe->res;
125125+ io_uring_cqe_seen(ring, cqe);
126126+ }
127127+128128+ close(fds[0]);
129129+ close(fds[1]);
130130+ if (fd != -1)
131131+ close(fd);
132132+ io_uring_unregister_files(ring);
133133+ return T_EXIT_PASS;
134134+}
135135+136136+/* test not setting IOSQE_FIXED_FILE */
137137+static int test_not_fixed(struct io_uring *ring)
138138+{
139139+ struct io_uring_sqe *sqe;
140140+ struct io_uring_cqe *cqe;
141141+ int ret, fds[2];
142142+143143+ if (pipe(fds) < 0) {
144144+ perror("pipe");
145145+ return T_EXIT_FAIL;
146146+ }
147147+148148+ ret = io_uring_register_files(ring, &fds[0], 1);
149149+ if (ret) {
150150+ fprintf(stderr, "failed register files %d\n", ret);
151151+ return T_EXIT_FAIL;
152152+ }
153153+154154+ sqe = io_uring_get_sqe(ring);
155155+ io_uring_prep_fixed_fd_install(sqe, 0, 0);
156156+ sqe->flags &= ~IOSQE_FIXED_FILE;
157157+ io_uring_submit(ring);
158158+159159+ ret = io_uring_wait_cqe(ring, &cqe);
160160+ if (ret) {
161161+ fprintf(stderr, "wait cqe %d\n", ret);
162162+ return T_EXIT_FAIL;
163163+ }
164164+ if (cqe->res != -EBADF) {
165165+ fprintf(stderr, "unexpected cqe res %d\n", cqe->res);
166166+ return T_EXIT_FAIL;
167167+ }
168168+169169+ io_uring_cqe_seen(ring, cqe);
170170+171171+ close(fds[0]);
172172+ close(fds[1]);
173173+ io_uring_unregister_files(ring);
174174+175175+ return T_EXIT_PASS;
176176+}
177177+178178+/* test invalid direct descriptor indexes */
179179+static int test_bad_fd(struct io_uring *ring, int some_fd)
180180+{
181181+ struct io_uring_sqe *sqe;
182182+ struct io_uring_cqe *cqe;
183183+ int ret;
184184+185185+ sqe = io_uring_get_sqe(ring);
186186+ io_uring_prep_fixed_fd_install(sqe, some_fd, 0);
187187+ io_uring_submit(ring);
188188+189189+ ret = io_uring_wait_cqe(ring, &cqe);
190190+ if (ret) {
191191+ fprintf(stderr, "wait cqe %d\n", ret);
192192+ return T_EXIT_FAIL;
193193+ }
194194+ if (cqe->res != -EBADF) {
195195+ fprintf(stderr, "unexpected cqe res %d\n", cqe->res);
196196+ return T_EXIT_FAIL;
197197+ }
198198+199199+ io_uring_cqe_seen(ring, cqe);
200200+ return T_EXIT_PASS;
201201+}
202202+203203+/* test basic functionality of shifting a direct descriptor to a normal file */
204204+static int test_working(struct io_uring *ring)
205205+{
206206+ struct io_uring_sqe *sqe;
207207+ struct io_uring_cqe *cqe;
208208+ int ret, fds[2];
209209+ char buf[32];
210210+211211+ if (pipe(fds) < 0) {
212212+ perror("pipe");
213213+ return T_EXIT_FAIL;
214214+ }
215215+216216+ /* register read side */
217217+ ret = io_uring_register_files(ring, &fds[0], 1);
218218+ if (ret) {
219219+ fprintf(stderr, "failed register files %d\n", ret);
220220+ return T_EXIT_FAIL;
221221+ }
222222+223223+ /* close normal descriptor */
224224+ close(fds[0]);
225225+226226+ /* normal read should fail */
227227+ ret = read(fds[0], buf, 1);
228228+ if (ret != -1) {
229229+ fprintf(stderr, "unexpected read ret %d\n", ret);
230230+ return T_EXIT_FAIL;
231231+ }
232232+ if (errno != EBADF) {
233233+ fprintf(stderr, "unexpected read failure %d\n", errno);
234234+ return T_EXIT_FAIL;
235235+ }
236236+237237+ /* verify we can read the data */
238238+ sqe = io_uring_get_sqe(ring);
239239+ io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0);
240240+ sqe->flags |= IOSQE_FIXED_FILE;
241241+ io_uring_submit(ring);
242242+243243+ /* put some data in the pipe */
244244+ ret = write(fds[1], "Hello", 5);
245245+ if (ret < 0) {
246246+ perror("write");
247247+ return T_EXIT_FAIL;
248248+ } else if (ret != 5) {
249249+ fprintf(stderr, "short write %d\n", ret);
250250+ return T_EXIT_FAIL;
251251+ }
252252+253253+ ret = io_uring_wait_cqe(ring, &cqe);
254254+ if (ret) {
255255+ fprintf(stderr, "wait cqe %d\n", ret);
256256+ return T_EXIT_FAIL;
257257+ }
258258+ if (cqe->res != 5) {
259259+ fprintf(stderr, "weird pipe read ret %d\n", cqe->res);
260260+ return T_EXIT_FAIL;
261261+ }
262262+ io_uring_cqe_seen(ring, cqe);
263263+264264+ /* fixed pipe read worked, now re-install as a regular fd */
265265+ sqe = io_uring_get_sqe(ring);
266266+ io_uring_prep_fixed_fd_install(sqe, 0, 0);
267267+ io_uring_submit(ring);
268268+269269+ ret = io_uring_wait_cqe(ring, &cqe);
270270+ if (ret) {
271271+ fprintf(stderr, "wait cqe %d\n", ret);
272272+ return T_EXIT_FAIL;
273273+ }
274274+ if (cqe->res == -EINVAL) {
275275+ no_fd_install = 1;
276276+ return T_EXIT_SKIP;
277277+ }
278278+ if (cqe->res < 0) {
279279+ fprintf(stderr, "failed install fd: %d\n", cqe->res);
280280+ return T_EXIT_FAIL;
281281+ }
282282+ /* stash new pipe read side fd in old spot */
283283+ fds[0] = cqe->res;
284284+ io_uring_cqe_seen(ring, cqe);
285285+286286+ ret = write(fds[1], "Hello", 5);
287287+ if (ret < 0) {
288288+ perror("write");
289289+ return T_EXIT_FAIL;
290290+ } else if (ret != 5) {
291291+ fprintf(stderr, "short write %d\n", ret);
292292+ return T_EXIT_FAIL;
293293+ }
294294+295295+ /* normal pipe read should now work with new fd */
296296+ ret = read(fds[0], buf, sizeof(buf));
297297+ if (ret != 5) {
298298+ fprintf(stderr, "unexpected read ret %d\n", ret);
299299+ return T_EXIT_FAIL;
300300+ }
301301+302302+ /* close fixed file */
303303+ sqe = io_uring_get_sqe(ring);
304304+ io_uring_prep_close_direct(sqe, 0);
305305+ io_uring_submit(ring);
306306+307307+ ret = io_uring_wait_cqe(ring, &cqe);
308308+ if (ret) {
309309+ fprintf(stderr, "wait cqe %d\n", ret);
310310+ return T_EXIT_FAIL;
311311+ }
312312+ if (cqe->res) {
313313+ fprintf(stderr, "close fixed fd %d\n", cqe->res);
314314+ return T_EXIT_FAIL;
315315+ }
316316+ io_uring_cqe_seen(ring, cqe);
317317+318318+ ret = write(fds[1], "Hello", 5);
319319+ if (ret < 0) {
320320+ perror("write");
321321+ return T_EXIT_FAIL;
322322+ } else if (ret != 5) {
323323+ fprintf(stderr, "short write %d\n", ret);
324324+ return T_EXIT_FAIL;
325325+ }
326326+327327+ /* normal pipe read should still work with new fd */
328328+ ret = read(fds[0], buf, sizeof(buf));
329329+ if (ret != 5) {
330330+ fprintf(stderr, "unexpected read ret %d\n", ret);
331331+ return T_EXIT_FAIL;
332332+ }
333333+334334+ /* fixed fd pipe read should now fail */
335335+ sqe = io_uring_get_sqe(ring);
336336+ io_uring_prep_read(sqe, 0, buf, sizeof(buf), 0);
337337+ sqe->flags = IOSQE_FIXED_FILE;
338338+ io_uring_submit(ring);
339339+340340+ /* put some data in the pipe */
341341+ ret = write(fds[1], "Hello", 5);
342342+ if (ret < 0) {
343343+ perror("write");
344344+ return T_EXIT_FAIL;
345345+ } else if (ret != 5) {
346346+ fprintf(stderr, "short write %d\n", ret);
347347+ return T_EXIT_FAIL;
348348+ }
349349+350350+ ret = io_uring_wait_cqe(ring, &cqe);
351351+ if (ret) {
352352+ fprintf(stderr, "wait cqe %d\n", ret);
353353+ return T_EXIT_FAIL;
354354+ }
355355+ if (cqe->res != -EBADF) {
356356+ fprintf(stderr, "weird pipe read ret %d\n", cqe->res);
357357+ return T_EXIT_FAIL;
358358+ }
359359+ io_uring_cqe_seen(ring, cqe);
360360+361361+ close(fds[0]);
362362+ close(fds[1]);
363363+ io_uring_unregister_files(ring);
364364+ return T_EXIT_PASS;
365365+}
366366+367367+static int test_creds(struct io_uring *ring, int async)
368368+{
369369+ struct io_uring_sqe *sqe;
370370+ struct io_uring_cqe *cqe;
371371+ int cred_id, ret, fds[2];
372372+373373+ if (pipe(fds) < 0) {
374374+ perror("pipe");
375375+ return T_EXIT_FAIL;
376376+ }
377377+378378+ ret = io_uring_register_files(ring, &fds[0], 1);
379379+ if (ret) {
380380+ fprintf(stderr, "failed register files %d\n", ret);
381381+ return T_EXIT_FAIL;
382382+ }
383383+384384+ cred_id = io_uring_register_personality(ring);
385385+ if (cred_id < 0) {
386386+ fprintf(stderr, "Failed registering creds: %d\n", cred_id);
387387+ return T_EXIT_FAIL;
388388+ }
389389+390390+ /* check that asking for creds fails */
391391+ sqe = io_uring_get_sqe(ring);
392392+ io_uring_prep_fixed_fd_install(sqe, 0, 0);
393393+ if (async)
394394+ sqe->flags |= IOSQE_ASYNC;
395395+ sqe->personality = cred_id;
396396+ io_uring_submit(ring);
397397+398398+ ret = io_uring_wait_cqe(ring, &cqe);
399399+ if (ret) {
400400+ fprintf(stderr, "wait cqe %d\n", ret);
401401+ return T_EXIT_FAIL;
402402+ }
403403+ if (cqe->res > 0) {
404404+ fprintf(stderr, "install succeeded with creds\n");
405405+ return T_EXIT_FAIL;
406406+ }
407407+ if (cqe->res != -EPERM) {
408408+ fprintf(stderr, "unexpected cqe res %d\n", cqe->res);
409409+ return T_EXIT_FAIL;
410410+ }
411411+ io_uring_cqe_seen(ring, cqe);
412412+413413+ close(fds[0]);
414414+ close(fds[1]);
415415+ io_uring_unregister_files(ring);
416416+ io_uring_unregister_personality(ring, cred_id);
417417+ return T_EXIT_PASS;
418418+}
419419+420420+int main(int argc, char *argv[])
421421+{
422422+ struct io_uring ring;
423423+ int ret;
424424+425425+ if (argc > 1)
426426+ return T_EXIT_SKIP;
427427+428428+ ret = io_uring_queue_init(4, &ring, 0);
429429+ if (ret) {
430430+ fprintf(stderr, "ring setup failed: %d\n", ret);
431431+ return T_EXIT_FAIL;
432432+ }
433433+434434+ ret = test_working(&ring);
435435+ if (ret != T_EXIT_PASS) {
436436+ if (ret == T_EXIT_FAIL)
437437+ fprintf(stderr, "test_working failed\n");
438438+ return ret;
439439+ }
440440+ if (no_fd_install)
441441+ return T_EXIT_SKIP;
442442+443443+ ret = test_bad_fd(&ring, 0);
444444+ if (ret != T_EXIT_PASS) {
445445+ if (ret == T_EXIT_FAIL)
446446+ fprintf(stderr, "test_bad_fd 0 failed\n");
447447+ return ret;
448448+ }
449449+450450+ ret = test_bad_fd(&ring, 500);
451451+ if (ret != T_EXIT_PASS) {
452452+ if (ret == T_EXIT_FAIL)
453453+ fprintf(stderr, "test_bad_fd 500 failed\n");
454454+ return ret;
455455+ }
456456+457457+ ret = test_not_fixed(&ring);
458458+ if (ret != T_EXIT_PASS) {
459459+ if (ret == T_EXIT_FAIL)
460460+ fprintf(stderr, "test_not_fixed failed\n");
461461+ return ret;
462462+ }
463463+464464+ ret = test_flags(&ring, 0);
465465+ if (ret != T_EXIT_PASS) {
466466+ if (ret == T_EXIT_FAIL)
467467+ fprintf(stderr, "test_flags 0 failed\n");
468468+ return ret;
469469+ }
470470+471471+ ret = test_flags(&ring, 1);
472472+ if (ret != T_EXIT_PASS) {
473473+ if (ret == T_EXIT_FAIL)
474474+ fprintf(stderr, "test_flags 1 failed\n");
475475+ return ret;
476476+ }
477477+478478+ ret = test_creds(&ring, 0);
479479+ if (ret != T_EXIT_PASS) {
480480+ if (ret == T_EXIT_FAIL)
481481+ fprintf(stderr, "test_creds 0 failed\n");
482482+ return ret;
483483+ }
484484+485485+ ret = test_creds(&ring, 1);
486486+ if (ret != T_EXIT_PASS) {
487487+ if (ret == T_EXIT_FAIL)
488488+ fprintf(stderr, "test_creds 1 failed\n");
489489+ return ret;
490490+ }
491491+492492+ ret = test_linked(&ring);
493493+ if (ret != T_EXIT_PASS) {
494494+ if (ret == T_EXIT_FAIL)
495495+ fprintf(stderr, "test_linked failed\n");
496496+ return ret;
497497+ }
498498+499499+ return T_EXIT_PASS;
500500+}
+7-3
vendor/liburing/test/file-register.c
···305305 files = open_files(100, 100, 0);
306306 ret = io_uring_register_files(ring, files, 200);
307307 if (ret) {
308308- if (ret == -EBADF) {
308308+ if (ret == -EBADF || ret == -EINVAL) {
309309 fprintf(stdout, "Sparse files not supported, skipping\n");
310310 no_update = 1;
311311 goto done;
···352352static int test_basic(struct io_uring *ring, int fail)
353353{
354354 int *files;
355355- int ret;
355355+ int ret, i;
356356 int nr_files = fail ? 10 : 100;
357357358358- files = open_files(nr_files, 0, 0);
358358+ files = open_files(nr_files, fail ? 90 : 0, 0);
359359+ if (fail) {
360360+ for (i = nr_files; i < nr_files + 90; i++)
361361+ files[i] = -2;
362362+ }
359363 ret = io_uring_register_files(ring, files, 100);
360364 if (ret) {
361365 if (fail) {
+1-1
vendor/liburing/test/file-verify.c
···2828#define MAX_VECS 16
29293030/*
3131- * Can be anything, let's just do something for a bit of parallellism
3131+ * Can be anything, let's just do something for a bit of parallelism
3232 */
3333#define READ_BATCH 16
3434
···3636int t_bind_ephemeral_port(int fd, struct sockaddr_in *addr)
3737{
3838 socklen_t addrlen;
3939+ int ret;
39404041 addr->sin_port = 0;
4142 if (bind(fd, (struct sockaddr *)addr, sizeof(*addr)))
4243 return -errno;
43444445 addrlen = sizeof(*addr);
4545- assert(!getsockname(fd, (struct sockaddr *)addr, &addrlen));
4646+ ret = getsockname(fd, (struct sockaddr *)addr, &addrlen);
4747+ assert(!ret);
4648 assert(addr->sin_port != 0);
4749 return 0;
4850}
···284286 * Ensure kernel sees the SQE updates before the tail update.
285287 */
286288 if (!(ring->flags & IORING_SETUP_SQPOLL))
287287- IO_URING_WRITE_ONCE(*sq->ktail, tail);
289289+ *sq->ktail = tail;
288290 else
289291 io_uring_smp_store_release(sq->ktail, tail);
290292 }
291293 /*
292292- * This _may_ look problematic, as we're not supposed to be reading
293293- * SQ->head without acquire semantics. When we're in SQPOLL mode, the
294294- * kernel submitter could be updating this right now. For non-SQPOLL,
295295- * task itself does it, and there's no potential race. But even for
296296- * SQPOLL, the load is going to be potentially out-of-date the very
297297- * instant it's done, regardless or whether or not it's done
298298- * atomically. Worst case, we're going to be over-estimating what
299299- * we can submit. The point is, we need to be able to deal with this
300300- * situation regardless of any perceived atomicity.
301301- */
302302- return tail - *sq->khead;
294294+ * This load needs to be atomic, since sq->khead is written concurrently
295295+ * by the kernel, but it doesn't need to be load_acquire, since the
296296+ * kernel doesn't store to the submission queue; it advances khead just
297297+ * to indicate that it's finished reading the submission queue entries
298298+ * so they're available for us to write to.
299299+ */
300300+ return tail - IO_URING_READ_ONCE(*sq->khead);
303301}
304302305303/*
···11+/* SPDX-License-Identifier: MIT */
22+/*
33+ * 6.10-rc merge window had a bug where the rewritten mmap support caused
44+ * rings allocated with > 1 page, but asking for smaller mappings, would
55+ * cause -EFAULT to be returned rather than a successful map. This hit
66+ * applications either using an ancient liburing with IORING_FEAT_SINGLE_MMAP
77+ * support, or application just ignoring that feature flag and still doing
88+ * 3 mmap operations to map the ring.
99+ */
1010+#include <stdio.h>
1111+#include <stdlib.h>
1212+#include <unistd.h>
1313+1414+#include "../src/syscall.h"
1515+#include "liburing.h"
1616+#include "helpers.h"
1717+1818+#define ENTRIES 128
1919+2020+int main(int argc, char *argv[])
2121+{
2222+ struct io_uring_params p = { };
2323+ void *ptr;
2424+ int fd;
2525+2626+ if (argc > 1)
2727+ return T_EXIT_SKIP;
2828+2929+ fd = __sys_io_uring_setup(ENTRIES, &p);
3030+ if (fd < 0)
3131+ return T_EXIT_SKIP;
3232+3333+ if (!(p.features & IORING_FEAT_SINGLE_MMAP)) {
3434+ close(fd);
3535+ return T_EXIT_SKIP;
3636+ }
3737+3838+ ptr = __sys_mmap(0, ENTRIES * sizeof(unsigned), PROT_READ | PROT_WRITE,
3939+ MAP_SHARED | MAP_POPULATE, fd,
4040+ IORING_OFF_SQ_RING);
4141+ if (!IS_ERR(ptr)) {
4242+ close(fd);
4343+ return T_EXIT_PASS;
4444+ }
4545+4646+ fprintf(stderr, "ring sqe array mmap: %d\n", PTR_ERR(ptr));
4747+ return T_EXIT_FAIL;
4848+}
···3838 * sqe_flags: combination of sqe flags
3939 * multi_sqes: record the user_data/index of all the multishot sqes
4040 * cnt: how many entries there are in multi_sqes
4141- * we can leverage multi_sqes array for cancellation: we randomly pick
4242- * up an entry in multi_sqes when form a cancellation sqe.
4141+ * we can leverage multi_sqes array for cancelation: we randomly pick
4242+ * up an entry in multi_sqes when form a cancelation sqe.
4343 * multi_cap: limitation of number of multishot sqes
4444 */
4545static const unsigned sqe_flags[4] = {
···109109{
110110 __u8 flags = 0;
111111 /*
112112- * drain sqe must be put after multishot sqes cancelled
112112+ * drain sqe must be put after multishot sqes canceled
113113 */
114114 do {
115115 flags = sqe_flags[rand() % 4];
···124124 /*
125125 * avoid below case:
126126 * sqe0(multishot, link)->sqe1(nop, link)->sqe2(nop)->sqe3(cancel_sqe0)
127127- * sqe3 may execute before sqe0 so that sqe0 isn't cancelled
127127+ * sqe3 may execute before sqe0 so that sqe0 isn't canceled
128128 */
129129 if (sqe_op == multi)
130130 flags &= ~IOSQE_IO_LINK;
···233233 }
234234235235 sleep(1);
236236- // TODO: randomize event triggerring order
236236+ // TODO: randomize event triggering order
237237 for (i = 0; i < max_entry; i++) {
238238 if (si[i].op != multi && si[i].op != single)
239239 continue;
···265265 }
266266 }
267267 /*
268268- * for multishot sqes, record them only when it is cancelled
268268+ * for multishot sqes, record them only when it is canceled
269269 */
270270 if ((si[index].op != multi) || (cqe_res[j] == -ECANCELED))
271271 compl_bits |= (1ULL << index);
+42
vendor/liburing/test/no-mmap-inval.c
···11+/* SPDX-License-Identifier: MIT */
22+/*
33+ * Description: test that using SETUP_NO_MMAP with an invalid SQ ring
44+ * address fails.
55+ *
66+ */
77+#include <stdlib.h>
88+#include <sys/types.h>
99+#include <stdio.h>
1010+#include <unistd.h>
1111+1212+#include "liburing.h"
1313+#include "helpers.h"
1414+1515+int main(int argc, char *argv[])
1616+{
1717+ struct io_uring_params p = {
1818+ .sq_entries = 2,
1919+ .cq_entries = 4,
2020+ .flags = IORING_SETUP_NO_MMAP,
2121+ };
2222+ struct io_uring ring;
2323+ void *addr;
2424+ int ret;
2525+2626+ if (argc > 1)
2727+ return T_EXIT_SKIP;
2828+2929+ t_posix_memalign(&addr, sysconf(_SC_PAGESIZE), 8192);
3030+ p.cq_off.user_addr = (unsigned long long) (uintptr_t) addr;
3131+3232+ ret = io_uring_queue_init_params(2, &ring, &p);
3333+ if (ret == -EINVAL) {
3434+ /* kernel doesn't support SETUP_NO_MMAP */
3535+ return T_EXIT_SKIP;
3636+ } else if (ret && (ret != -EFAULT && ret != -ENOMEM)) {
3737+ fprintf(stderr, "Got %d, wanted -EFAULT\n", ret);
3838+ return T_EXIT_FAIL;
3939+ }
4040+4141+ return T_EXIT_PASS;
4242+}