Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: add support for BPF filtering for opcode restrictions

Add support for loading classic BPF programs with io_uring to provide
fine-grained filtering of SQE operations. Unlike
IORING_REGISTER_RESTRICTIONS which only allows bitmap-based allow/deny
of opcodes, BPF filters can inspect request attributes and make dynamic
decisions.

The filter is registered via IORING_REGISTER_BPF_FILTER with a struct
io_uring_bpf:

struct io_uring_bpf_filter {
__u32 opcode; /* io_uring opcode to filter */
__u32 flags;
__u32 filter_len; /* number of BPF instructions */
__u32 resv;
__u64 filter_ptr; /* pointer to BPF filter */
__u64 resv2[5];
};

enum {
IO_URING_BPF_CMD_FILTER = 1,
};

struct io_uring_bpf {
__u16 cmd_type; /* IO_URING_BPF_* values */
__u16 cmd_flags; /* none so far */
__u32 resv;
union {
struct io_uring_bpf_filter filter;
};
};

and the filters get supplied a struct io_uring_bpf_ctx:

struct io_uring_bpf_ctx {
__u64 user_data;
__u8 opcode;
__u8 sqe_flags;
__u8 pdu_size;
__u8 pad[5];
};

where it's possible to filter on opcode and sqe_flags, with pdu_size
indicating how much extra data is being passed in beyond the pad field.
This will be used for specific finer grained filtering inside an opcode.
An example of that for sockets is in one of the following patches.
Anything the opcode supports can end up in this struct, populated by
the opcode itself, and hence can be filtered for.

Filters have the following semantics:
- Return 1 to allow the request
- Return 0 to deny the request with -EACCES
- Multiple filters can be stacked per opcode. All filters must
return 1 for the opcode to be allowed.
- Filters are evaluated in reverse registration order (most recently
registered first)

The implementation uses classic BPF (cBPF) rather than eBPF, as that's
required for containers, and since these filters can be used by any
user in the system.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+447
+9
include/linux/io_uring_types.h
··· 219 219 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; 220 220 }; 221 221 222 + struct io_bpf_filter; 223 + struct io_bpf_filters { 224 + refcount_t refs; /* ref for ->bpf_filters */ 225 + spinlock_t lock; /* protects ->bpf_filters modifications */ 226 + struct io_bpf_filter __rcu **filters; 227 + struct rcu_head rcu_head; 228 + }; 229 + 222 230 struct io_restriction { 223 231 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); 224 232 DECLARE_BITMAP(sqe_op, IORING_OP_LAST); 233 + struct io_bpf_filters *bpf_filters; 225 234 u8 sqe_flags_allowed; 226 235 u8 sqe_flags_required; 227 236 /* IORING_OP_* restrictions exist */
+3
include/uapi/linux/io_uring.h
··· 700 700 /* auxiliary zcrx configuration, see enum zcrx_ctrl_op */ 701 701 IORING_REGISTER_ZCRX_CTRL = 36, 702 702 703 + /* register bpf filtering programs */ 704 + IORING_REGISTER_BPF_FILTER = 37, 705 + 703 706 /* this goes last */ 704 707 IORING_REGISTER_LAST, 705 708
+50
include/uapi/linux/io_uring/bpf_filter.h
··· 1 + /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ 2 + /* 3 + * Header file for the io_uring BPF filters. 4 + */ 5 + #ifndef LINUX_IO_URING_BPF_FILTER_H 6 + #define LINUX_IO_URING_BPF_FILTER_H 7 + 8 + #include <linux/types.h> 9 + 10 + /* 11 + * Struct passed to filters. 12 + */ 13 + struct io_uring_bpf_ctx { 14 + __u64 user_data; 15 + __u8 opcode; 16 + __u8 sqe_flags; 17 + __u8 pdu_size; /* size of aux data for filter */ 18 + __u8 pad[5]; 19 + }; 20 + 21 + enum { 22 + /* 23 + * If set, any currently unset opcode will have a deny filter attached 24 + */ 25 + IO_URING_BPF_FILTER_DENY_REST = 1, 26 + }; 27 + 28 + struct io_uring_bpf_filter { 29 + __u32 opcode; /* io_uring opcode to filter */ 30 + __u32 flags; 31 + __u32 filter_len; /* number of BPF instructions */ 32 + __u32 resv; 33 + __u64 filter_ptr; /* pointer to BPF filter */ 34 + __u64 resv2[5]; 35 + }; 36 + 37 + enum { 38 + IO_URING_BPF_CMD_FILTER = 1, 39 + }; 40 + 41 + struct io_uring_bpf { 42 + __u16 cmd_type; /* IO_URING_BPF_* values */ 43 + __u16 cmd_flags; /* none so far */ 44 + __u32 resv; 45 + union { 46 + struct io_uring_bpf_filter filter; 47 + }; 48 + }; 49 + 50 + #endif
+5
io_uring/Kconfig
··· 9 9 depends on PAGE_POOL 10 10 depends on INET 11 11 depends on NET_RX_BUSY_POLL 12 + 13 + config IO_URING_BPF 14 + def_bool y 15 + depends on BPF 16 + depends on NET
+1
io_uring/Makefile
··· 24 24 obj-$(CONFIG_NET) += net.o cmd_net.o 25 25 obj-$(CONFIG_PROC_FS) += fdinfo.o 26 26 obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o 27 + obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o
+321
io_uring/bpf_filter.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * BPF filter support for io_uring. Supports SQE opcodes for now. 4 + */ 5 + #include <linux/kernel.h> 6 + #include <linux/errno.h> 7 + #include <linux/io_uring.h> 8 + #include <linux/filter.h> 9 + #include <linux/bpf.h> 10 + #include <uapi/linux/io_uring.h> 11 + 12 + #include "io_uring.h" 13 + #include "bpf_filter.h" 14 + #include "net.h" 15 + 16 + struct io_bpf_filter { 17 + struct bpf_prog *prog; 18 + struct io_bpf_filter *next; 19 + }; 20 + 21 + /* Deny if this is set as the filter */ 22 + static const struct io_bpf_filter dummy_filter; 23 + 24 + static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx, 25 + struct io_kiocb *req) 26 + { 27 + bctx->opcode = req->opcode; 28 + bctx->sqe_flags = (__force int) req->flags & SQE_VALID_FLAGS; 29 + bctx->user_data = req->cqe.user_data; 30 + /* clear residual, anything from pdu_size and below */ 31 + memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0, 32 + sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size)); 33 + } 34 + 35 + /* 36 + * Run registered filters for a given opcode. For filters, a return of 0 denies 37 + * execution of the request, a return of 1 allows it. If any filter for an 38 + * opcode returns 0, filter processing is stopped, and the request is denied. 39 + * This also stops the processing of filters. 40 + * 41 + * __io_uring_run_bpf_filters() returns 0 on success, allow running the 42 + * request, and -EACCES when a request is denied. 43 + */ 44 + int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req) 45 + { 46 + struct io_bpf_filter *filter; 47 + struct io_uring_bpf_ctx bpf_ctx; 48 + int ret; 49 + 50 + /* Fast check for existence of filters outside of RCU */ 51 + if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode])) 52 + return 0; 53 + 54 + /* 55 + * req->opcode has already been validated to be within the range 56 + * of what we expect, io_init_req() does this. 
57 + */ 58 + guard(rcu)(); 59 + filter = rcu_dereference(res->bpf_filters->filters[req->opcode]); 60 + if (!filter) 61 + return 0; 62 + else if (filter == &dummy_filter) 63 + return -EACCES; 64 + 65 + io_uring_populate_bpf_ctx(&bpf_ctx, req); 66 + 67 + /* 68 + * Iterate registered filters. The opcode is allowed IFF all filters 69 + * return 1. If any filter returns denied, opcode will be denied. 70 + */ 71 + do { 72 + if (filter == &dummy_filter) 73 + return -EACCES; 74 + ret = bpf_prog_run(filter->prog, &bpf_ctx); 75 + if (!ret) 76 + return -EACCES; 77 + filter = filter->next; 78 + } while (filter); 79 + 80 + return 0; 81 + } 82 + 83 + static void io_free_bpf_filters(struct rcu_head *head) 84 + { 85 + struct io_bpf_filter __rcu **filter; 86 + struct io_bpf_filters *filters; 87 + int i; 88 + 89 + filters = container_of(head, struct io_bpf_filters, rcu_head); 90 + scoped_guard(spinlock, &filters->lock) { 91 + filter = filters->filters; 92 + if (!filter) 93 + return; 94 + } 95 + 96 + for (i = 0; i < IORING_OP_LAST; i++) { 97 + struct io_bpf_filter *f; 98 + 99 + rcu_read_lock(); 100 + f = rcu_dereference(filter[i]); 101 + while (f) { 102 + struct io_bpf_filter *next = f->next; 103 + 104 + /* 105 + * Even if stacked, dummy filter will always be last 106 + * as it can only get installed into an empty spot. 
107 + */ 108 + if (f == &dummy_filter) 109 + break; 110 + bpf_prog_destroy(f->prog); 111 + kfree(f); 112 + f = next; 113 + } 114 + rcu_read_unlock(); 115 + } 116 + kfree(filters->filters); 117 + kfree(filters); 118 + } 119 + 120 + static void __io_put_bpf_filters(struct io_bpf_filters *filters) 121 + { 122 + if (refcount_dec_and_test(&filters->refs)) 123 + call_rcu(&filters->rcu_head, io_free_bpf_filters); 124 + } 125 + 126 + void io_put_bpf_filters(struct io_restriction *res) 127 + { 128 + if (res->bpf_filters) 129 + __io_put_bpf_filters(res->bpf_filters); 130 + } 131 + 132 + static struct io_bpf_filters *io_new_bpf_filters(void) 133 + { 134 + struct io_bpf_filters *filters __free(kfree) = NULL; 135 + 136 + filters = kzalloc(sizeof(*filters), GFP_KERNEL_ACCOUNT); 137 + if (!filters) 138 + return ERR_PTR(-ENOMEM); 139 + 140 + filters->filters = kcalloc(IORING_OP_LAST, 141 + sizeof(struct io_bpf_filter *), 142 + GFP_KERNEL_ACCOUNT); 143 + if (!filters->filters) 144 + return ERR_PTR(-ENOMEM); 145 + 146 + refcount_set(&filters->refs, 1); 147 + spin_lock_init(&filters->lock); 148 + return no_free_ptr(filters); 149 + } 150 + 151 + /* 152 + * Validate classic BPF filter instructions. Only allow a safe subset of 153 + * operations - no packet data access, just context field loads and basic 154 + * ALU/jump operations. 155 + */ 156 + static int io_uring_check_cbpf_filter(struct sock_filter *filter, 157 + unsigned int flen) 158 + { 159 + int pc; 160 + 161 + for (pc = 0; pc < flen; pc++) { 162 + struct sock_filter *ftest = &filter[pc]; 163 + u16 code = ftest->code; 164 + u32 k = ftest->k; 165 + 166 + switch (code) { 167 + case BPF_LD | BPF_W | BPF_ABS: 168 + ftest->code = BPF_LDX | BPF_W | BPF_ABS; 169 + /* 32-bit aligned and not out of bounds. 
*/ 170 + if (k >= sizeof(struct io_uring_bpf_ctx) || k & 3) 171 + return -EINVAL; 172 + continue; 173 + case BPF_LD | BPF_W | BPF_LEN: 174 + ftest->code = BPF_LD | BPF_IMM; 175 + ftest->k = sizeof(struct io_uring_bpf_ctx); 176 + continue; 177 + case BPF_LDX | BPF_W | BPF_LEN: 178 + ftest->code = BPF_LDX | BPF_IMM; 179 + ftest->k = sizeof(struct io_uring_bpf_ctx); 180 + continue; 181 + /* Explicitly include allowed calls. */ 182 + case BPF_RET | BPF_K: 183 + case BPF_RET | BPF_A: 184 + case BPF_ALU | BPF_ADD | BPF_K: 185 + case BPF_ALU | BPF_ADD | BPF_X: 186 + case BPF_ALU | BPF_SUB | BPF_K: 187 + case BPF_ALU | BPF_SUB | BPF_X: 188 + case BPF_ALU | BPF_MUL | BPF_K: 189 + case BPF_ALU | BPF_MUL | BPF_X: 190 + case BPF_ALU | BPF_DIV | BPF_K: 191 + case BPF_ALU | BPF_DIV | BPF_X: 192 + case BPF_ALU | BPF_AND | BPF_K: 193 + case BPF_ALU | BPF_AND | BPF_X: 194 + case BPF_ALU | BPF_OR | BPF_K: 195 + case BPF_ALU | BPF_OR | BPF_X: 196 + case BPF_ALU | BPF_XOR | BPF_K: 197 + case BPF_ALU | BPF_XOR | BPF_X: 198 + case BPF_ALU | BPF_LSH | BPF_K: 199 + case BPF_ALU | BPF_LSH | BPF_X: 200 + case BPF_ALU | BPF_RSH | BPF_K: 201 + case BPF_ALU | BPF_RSH | BPF_X: 202 + case BPF_ALU | BPF_NEG: 203 + case BPF_LD | BPF_IMM: 204 + case BPF_LDX | BPF_IMM: 205 + case BPF_MISC | BPF_TAX: 206 + case BPF_MISC | BPF_TXA: 207 + case BPF_LD | BPF_MEM: 208 + case BPF_LDX | BPF_MEM: 209 + case BPF_ST: 210 + case BPF_STX: 211 + case BPF_JMP | BPF_JA: 212 + case BPF_JMP | BPF_JEQ | BPF_K: 213 + case BPF_JMP | BPF_JEQ | BPF_X: 214 + case BPF_JMP | BPF_JGE | BPF_K: 215 + case BPF_JMP | BPF_JGE | BPF_X: 216 + case BPF_JMP | BPF_JGT | BPF_K: 217 + case BPF_JMP | BPF_JGT | BPF_X: 218 + case BPF_JMP | BPF_JSET | BPF_K: 219 + case BPF_JMP | BPF_JSET | BPF_X: 220 + continue; 221 + default: 222 + return -EINVAL; 223 + } 224 + } 225 + return 0; 226 + } 227 + 228 + #define IO_URING_BPF_FILTER_FLAGS IO_URING_BPF_FILTER_DENY_REST 229 + 230 + int io_register_bpf_filter(struct io_restriction *res, 231 + struct 
io_uring_bpf __user *arg) 232 + { 233 + struct io_bpf_filter *filter, *old_filter; 234 + struct io_bpf_filters *filters; 235 + struct io_uring_bpf reg; 236 + struct bpf_prog *prog; 237 + struct sock_fprog fprog; 238 + int ret; 239 + 240 + if (copy_from_user(&reg, arg, sizeof(reg))) 241 + return -EFAULT; 242 + if (reg.cmd_type != IO_URING_BPF_CMD_FILTER) 243 + return -EINVAL; 244 + if (reg.cmd_flags || reg.resv) 245 + return -EINVAL; 246 + 247 + if (reg.filter.opcode >= IORING_OP_LAST) 248 + return -EINVAL; 249 + if (reg.filter.flags & ~IO_URING_BPF_FILTER_FLAGS) 250 + return -EINVAL; 251 + if (reg.filter.resv) 252 + return -EINVAL; 253 + if (!mem_is_zero(reg.filter.resv2, sizeof(reg.filter.resv2))) 254 + return -EINVAL; 255 + if (!reg.filter.filter_len || reg.filter.filter_len > BPF_MAXINSNS) 256 + return -EINVAL; 257 + 258 + fprog.len = reg.filter.filter_len; 259 + fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr); 260 + 261 + ret = bpf_prog_create_from_user(&prog, &fprog, 262 + io_uring_check_cbpf_filter, false); 263 + if (ret) 264 + return ret; 265 + 266 + /* 267 + * No existing filters, allocate set. 268 + */ 269 + filters = res->bpf_filters; 270 + if (!filters) { 271 + filters = io_new_bpf_filters(); 272 + if (IS_ERR(filters)) { 273 + ret = PTR_ERR(filters); 274 + goto err_prog; 275 + } 276 + } 277 + 278 + filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT); 279 + if (!filter) { 280 + ret = -ENOMEM; 281 + goto err; 282 + } 283 + filter->prog = prog; 284 + res->bpf_filters = filters; 285 + 286 + /* 287 + * Insert filter - if the current opcode already has a filter 288 + * attached, add to the set. 
289 + */ 290 + rcu_read_lock(); 291 + spin_lock_bh(&filters->lock); 292 + old_filter = rcu_dereference(filters->filters[reg.filter.opcode]); 293 + if (old_filter) 294 + filter->next = old_filter; 295 + rcu_assign_pointer(filters->filters[reg.filter.opcode], filter); 296 + 297 + /* 298 + * If IO_URING_BPF_FILTER_DENY_REST is set, fill any unregistered 299 + * opcode with the dummy filter. That will cause them to be denied. 300 + */ 301 + if (reg.filter.flags & IO_URING_BPF_FILTER_DENY_REST) { 302 + for (int i = 0; i < IORING_OP_LAST; i++) { 303 + if (i == reg.filter.opcode) 304 + continue; 305 + old_filter = rcu_dereference(filters->filters[i]); 306 + if (old_filter) 307 + continue; 308 + rcu_assign_pointer(filters->filters[i], &dummy_filter); 309 + } 310 + } 311 + 312 + spin_unlock_bh(&filters->lock); 313 + rcu_read_unlock(); 314 + return 0; 315 + err: 316 + if (filters != res->bpf_filters) 317 + __io_put_bpf_filters(filters); 318 + err_prog: 319 + bpf_prog_destroy(prog); 320 + return ret; 321 + }
+42
io_uring/bpf_filter.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef IO_URING_BPF_FILTER_H 3 + #define IO_URING_BPF_FILTER_H 4 + 5 + #include <uapi/linux/io_uring/bpf_filter.h> 6 + 7 + #ifdef CONFIG_IO_URING_BPF 8 + 9 + int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req); 10 + 11 + int io_register_bpf_filter(struct io_restriction *res, 12 + struct io_uring_bpf __user *arg); 13 + 14 + void io_put_bpf_filters(struct io_restriction *res); 15 + 16 + static inline int io_uring_run_bpf_filters(struct io_restriction *res, 17 + struct io_kiocb *req) 18 + { 19 + if (res->bpf_filters) 20 + return __io_uring_run_bpf_filters(res, req); 21 + 22 + return 0; 23 + } 24 + 25 + #else 26 + 27 + static inline int io_register_bpf_filter(struct io_restriction *res, 28 + struct io_uring_bpf __user *arg) 29 + { 30 + return -EINVAL; 31 + } 32 + static inline int io_uring_run_bpf_filters(struct io_restriction *res, 33 + struct io_kiocb *req) 34 + { 35 + return 0; 36 + } 37 + static inline void io_put_bpf_filters(struct io_restriction *res) 38 + { 39 + } 40 + #endif /* CONFIG_IO_URING_BPF */ 41 + 42 + #endif
+8
io_uring/io_uring.c
··· 94 94 #include "alloc_cache.h" 95 95 #include "eventfd.h" 96 96 #include "wait.h" 97 + #include "bpf_filter.h" 97 98 98 99 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ 99 100 IOSQE_IO_HARDLINK | IOSQE_ASYNC) ··· 1875 1874 if (unlikely(ret)) 1876 1875 return io_submit_fail_init(sqe, req, ret); 1877 1876 1877 + if (unlikely(ctx->restrictions.bpf_filters)) { 1878 + ret = io_uring_run_bpf_filters(&ctx->restrictions, req); 1879 + if (ret) 1880 + return io_submit_fail_init(sqe, req, ret); 1881 + } 1882 + 1878 1883 trace_io_uring_submit_req(req); 1879 1884 1880 1885 /* ··· 2168 2161 percpu_ref_exit(&ctx->refs); 2169 2162 free_uid(ctx->user); 2170 2163 io_req_caches_free(ctx); 2164 + io_put_bpf_filters(&ctx->restrictions); 2171 2165 2172 2166 WARN_ON_ONCE(ctx->nr_req_allocated); 2173 2167
+8
io_uring/register.c
··· 33 33 #include "memmap.h" 34 34 #include "zcrx.h" 35 35 #include "query.h" 36 + #include "bpf_filter.h" 36 37 37 38 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 38 39 IORING_REGISTER_LAST + IORING_OP_LAST) ··· 830 829 break; 831 830 case IORING_REGISTER_ZCRX_CTRL: 832 831 ret = io_zcrx_ctrl(ctx, arg, nr_args); 832 + break; 833 + case IORING_REGISTER_BPF_FILTER: 834 + ret = -EINVAL; 835 + 836 + if (nr_args != 1) 837 + break; 838 + ret = io_register_bpf_filter(&ctx->restrictions, arg); 833 839 break; 834 840 default: 835 841 ret = -EINVAL;