Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: allow registration of per-task restrictions

Currently io_uring supports restricting operations on a per-ring basis.
To use those, the ring must be set up in a disabled state by setting
IORING_SETUP_R_DISABLED. Then restrictions can be set for the ring, and
the ring can then be enabled.

This commit adds support for IORING_REGISTER_RESTRICTIONS with ring_fd
== -1, like the other "blind" register opcodes which work on the task
rather than a specific ring. This allows registration of the same kind
of restrictions as can be done on a specific ring, but on the task
itself. Once done, any ring created will inherit these restrictions.

If a restriction filter is registered with a task, then it's inherited
on fork for its children. Children may only further restrict operations,
not extend them.

Inherited restrictions include both the classic
IORING_REGISTER_RESTRICTIONS based restrictions, as well as the BPF
filters that have been registered with the task via
IORING_REGISTER_BPF_FILTER.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+231 -1
+2
include/linux/io_uring_types.h
··· 231 231 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); 232 232 DECLARE_BITMAP(sqe_op, IORING_OP_LAST); 233 233 struct io_bpf_filters *bpf_filters; 234 + /* ->bpf_filters needs COW on modification */ 235 + bool bpf_filters_cow; 234 236 u8 sqe_flags_allowed; 235 237 u8 sqe_flags_required; 236 238 /* IORING_OP_* restrictions exist */
+7
include/uapi/linux/io_uring.h
··· 808 808 __u32 resv2[3]; 809 809 }; 810 810 811 + struct io_uring_task_restriction { 812 + __u16 flags; 813 + __u16 nr_res; 814 + __u32 resv[3]; 815 + __DECLARE_FLEX_ARRAY(struct io_uring_restriction, restrictions); 816 + }; 817 + 811 818 struct io_uring_clock_register { 812 819 __u32 clockid; 813 820 __u32 __resv[3];
+85 -1
io_uring/bpf_filter.c
··· 249 249 return 0; 250 250 } 251 251 252 + void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src) 253 + { 254 + if (!src->bpf_filters) 255 + return; 256 + 257 + rcu_read_lock(); 258 + /* 259 + * If the src filter is going away, just ignore it. 260 + */ 261 + if (refcount_inc_not_zero(&src->bpf_filters->refs)) { 262 + dst->bpf_filters = src->bpf_filters; 263 + dst->bpf_filters_cow = true; 264 + } 265 + rcu_read_unlock(); 266 + } 267 + 268 + /* 269 + * Allocate a new struct io_bpf_filters. Used when a filter is cloned and 270 + * modifications need to be made. 271 + */ 272 + static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src) 273 + { 274 + struct io_bpf_filters *filters; 275 + struct io_bpf_filter *srcf; 276 + int i; 277 + 278 + filters = io_new_bpf_filters(); 279 + if (IS_ERR(filters)) 280 + return filters; 281 + 282 + /* 283 + * Iterate filters from src and assign in destination. Grabbing 284 + * a reference is enough, we don't need to duplicate the memory. 285 + * This is safe because filters are only ever appended to the 286 + * front of the list, hence the only memory ever touched inside 287 + * a filter is the refcount. 288 + */ 289 + rcu_read_lock(); 290 + for (i = 0; i < IORING_OP_LAST; i++) { 291 + srcf = rcu_dereference(src->bpf_filters->filters[i]); 292 + if (!srcf) { 293 + continue; 294 + } else if (srcf == &dummy_filter) { 295 + rcu_assign_pointer(filters->filters[i], &dummy_filter); 296 + continue; 297 + } 298 + 299 + /* 300 + * Getting a ref on the first node is enough, putting the 301 + * filter and iterating nodes to free will stop on the first 302 + * one that doesn't hit zero when dropping. 
303 + */ 304 + if (!refcount_inc_not_zero(&srcf->refs)) 305 + goto err; 306 + rcu_assign_pointer(filters->filters[i], srcf); 307 + } 308 + rcu_read_unlock(); 309 + return filters; 310 + err: 311 + rcu_read_unlock(); 312 + __io_put_bpf_filters(filters); 313 + return ERR_PTR(-EBUSY); 314 + } 315 + 252 316 #define IO_URING_BPF_FILTER_FLAGS IO_URING_BPF_FILTER_DENY_REST 253 317 254 318 int io_register_bpf_filter(struct io_restriction *res, 255 319 struct io_uring_bpf __user *arg) 256 320 { 321 + struct io_bpf_filters *filters, *old_filters = NULL; 257 322 struct io_bpf_filter *filter, *old_filter; 258 - struct io_bpf_filters *filters; 259 323 struct io_uring_bpf reg; 260 324 struct bpf_prog *prog; 261 325 struct sock_fprog fprog; ··· 361 297 ret = PTR_ERR(filters); 362 298 goto err_prog; 363 299 } 300 + } else if (res->bpf_filters_cow) { 301 + filters = io_bpf_filter_cow(res); 302 + if (IS_ERR(filters)) { 303 + ret = PTR_ERR(filters); 304 + goto err_prog; 305 + } 306 + /* 307 + * Stash old filters, we'll put them once we know we'll 308 + * succeed. Until then, res->bpf_filters is left untouched. 309 + */ 310 + old_filters = res->bpf_filters; 364 311 } 365 312 366 313 filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT); ··· 381 306 } 382 307 refcount_set(&filter->refs, 1); 383 308 filter->prog = prog; 309 + 310 + /* 311 + * Success - install the new filter set now. If we did COW, put 312 + * the old filters as we're replacing them. 313 + */ 314 + if (old_filters) { 315 + __io_put_bpf_filters(old_filters); 316 + res->bpf_filters_cow = false; 317 + } 384 318 res->bpf_filters = filters; 385 319 386 320 /*
+6
io_uring/bpf_filter.h
··· 13 13 14 14 void io_put_bpf_filters(struct io_restriction *res); 15 15 16 + void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src); 17 + 16 18 static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters, 17 19 struct io_kiocb *req) 18 20 { ··· 37 35 return 0; 38 36 } 39 37 static inline void io_put_bpf_filters(struct io_restriction *res) 38 + { 39 + } 40 + static inline void io_bpf_filter_clone(struct io_restriction *dst, 41 + struct io_restriction *src) 40 42 { 41 43 } 42 44 #endif /* CONFIG_IO_URING_BPF */
+33
io_uring/io_uring.c
··· 2880 2880 return 0; 2881 2881 } 2882 2882 2883 + void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src) 2884 + { 2885 + memcpy(&dst->register_op, &src->register_op, sizeof(dst->register_op)); 2886 + memcpy(&dst->sqe_op, &src->sqe_op, sizeof(dst->sqe_op)); 2887 + dst->sqe_flags_allowed = src->sqe_flags_allowed; 2888 + dst->sqe_flags_required = src->sqe_flags_required; 2889 + dst->op_registered = src->op_registered; 2890 + dst->reg_registered = src->reg_registered; 2891 + 2892 + io_bpf_filter_clone(dst, src); 2893 + } 2894 + 2895 + static void io_ctx_restriction_clone(struct io_ring_ctx *ctx, 2896 + struct io_restriction *src) 2897 + { 2898 + struct io_restriction *dst = &ctx->restrictions; 2899 + 2900 + io_restriction_clone(dst, src); 2901 + if (dst->bpf_filters) 2902 + WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters); 2903 + if (dst->op_registered) 2904 + ctx->op_restricted = 1; 2905 + if (dst->reg_registered) 2906 + ctx->reg_restricted = 1; 2907 + } 2908 + 2883 2909 static __cold int io_uring_create(struct io_ctx_config *config) 2884 2910 { 2885 2911 struct io_uring_params *p = &config->p; ··· 2965 2939 ctx->notify_method = TWA_SIGNAL_NO_IPI; 2966 2940 else 2967 2941 ctx->notify_method = TWA_SIGNAL; 2942 + 2943 + /* 2944 + * If the current task has restrictions enabled, then copy them to 2945 + * our newly created ring and mark it as registered. 2946 + */ 2947 + if (current->io_uring_restrict) 2948 + io_ctx_restriction_clone(ctx, current->io_uring_restrict); 2968 2949 2969 2950 /* 2970 2951 * This is just grabbed for accounting purposes. When a process exits,
+1
io_uring/io_uring.h
··· 197 197 bool __io_alloc_req_refill(struct io_ring_ctx *ctx); 198 198 199 199 void io_activate_pollwq(struct io_ring_ctx *ctx); 200 + void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src); 200 201 201 202 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) 202 203 {
+80
io_uring/register.c
··· 190 190 return 0; 191 191 } 192 192 193 + static int io_register_restrictions_task(void __user *arg, unsigned int nr_args) 194 + { 195 + struct io_uring_task_restriction __user *ures = arg; 196 + struct io_uring_task_restriction tres; 197 + struct io_restriction *res; 198 + int ret; 199 + 200 + /* Disallow if task already has registered restrictions */ 201 + if (current->io_uring_restrict) 202 + return -EPERM; 203 + /* 204 + * Similar to seccomp, disallow setting a filter if task_no_new_privs 205 + * is not set and we're not CAP_SYS_ADMIN. 206 + */ 207 + if (!task_no_new_privs(current) && 208 + !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) 209 + return -EACCES; 210 + if (nr_args != 1) 211 + return -EINVAL; 212 + 213 + if (copy_from_user(&tres, arg, sizeof(tres))) 214 + return -EFAULT; 215 + 216 + if (tres.flags) 217 + return -EINVAL; 218 + if (!mem_is_zero(tres.resv, sizeof(tres.resv))) 219 + return -EINVAL; 220 + 221 + res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT); 222 + if (!res) 223 + return -ENOMEM; 224 + 225 + ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res); 226 + if (ret < 0) { 227 + kfree(res); 228 + return ret; 229 + } 230 + current->io_uring_restrict = res; 231 + return 0; 232 + } 233 + 234 + static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args) 235 + { 236 + struct io_restriction *res; 237 + int ret; 238 + 239 + /* 240 + * Similar to seccomp, disallow setting a filter if task_no_new_privs 241 + * is not set and we're not CAP_SYS_ADMIN. 
242 + */ 243 + if (!task_no_new_privs(current) && 244 + !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) 245 + return -EACCES; 246 + 247 + if (nr_args != 1) 248 + return -EINVAL; 249 + 250 + /* If no task restrictions exist, setup a new set */ 251 + res = current->io_uring_restrict; 252 + if (!res) { 253 + res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT); 254 + if (!res) 255 + return -ENOMEM; 256 + } 257 + 258 + ret = io_register_bpf_filter(res, arg); 259 + if (ret) { 260 + if (res != current->io_uring_restrict) 261 + kfree(res); 262 + return ret; 263 + } 264 + if (!current->io_uring_restrict) 265 + current->io_uring_restrict = res; 266 + return 0; 267 + } 268 + 193 269 static int io_register_enable_rings(struct io_ring_ctx *ctx) 194 270 { 195 271 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) ··· 988 912 return io_uring_register_send_msg_ring(arg, nr_args); 989 913 case IORING_REGISTER_QUERY: 990 914 return io_query(arg, nr_args); 915 + case IORING_REGISTER_RESTRICTIONS: 916 + return io_register_restrictions_task(arg, nr_args); 917 + case IORING_REGISTER_BPF_FILTER: 918 + return io_register_bpf_filter_task(arg, nr_args); 991 919 } 992 920 return -EINVAL; 993 921 }
+17
io_uring/tctx.c
··· 11 11 12 12 #include "io_uring.h" 13 13 #include "tctx.h" 14 + #include "bpf_filter.h" 14 15 15 16 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, 16 17 struct task_struct *task) ··· 66 65 percpu_counter_destroy(&tctx->inflight); 67 66 kfree(tctx); 68 67 tsk->io_uring = NULL; 68 + } 69 + if (tsk->io_uring_restrict) { 70 + io_put_bpf_filters(tsk->io_uring_restrict); 71 + kfree(tsk->io_uring_restrict); 72 + tsk->io_uring_restrict = NULL; 69 73 } 70 74 } 71 75 ··· 362 356 363 357 int __io_uring_fork(struct task_struct *tsk) 364 358 { 359 + struct io_restriction *res, *src = tsk->io_uring_restrict; 360 + 361 + /* Don't leave it dangling on error */ 362 + tsk->io_uring_restrict = NULL; 363 + 364 + res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT); 365 + if (!res) 366 + return -ENOMEM; 367 + 368 + tsk->io_uring_restrict = res; 369 + io_restriction_clone(res, src); 365 370 return 0; 366 371 }