io_uring/kbuf: Use slab for struct io_buffer objects

The allocation of struct io_buffer for metadata of provided buffers is
done through a custom allocator that directly gets pages and
fragments them. But slab would do just fine, as this is not a hot path
(in fact, it is a deprecated feature) and, by keeping a custom allocator
implementation, we lose benefits like tracking, poisoning, and
sanitizers. Finally, the custom code is more complex and requires
keeping the list of pages in struct io_ring_ctx for no good reason. This patch
cleans this path up and just uses slab.

I microbenchmarked it by forcing the allocation of a large number of
objects with the least number of io_uring commands possible (keeping
nbufs=USHRT_MAX), with and without the patch. There is a slight
increase in time spent in the allocation with slab, of course, but even
when allocating until system resources were exhausted, which is not very
realistic and happened at around half a billion provided buffers for me,
it wasn't a significant hit in system time. This is especially true in a
real-world scenario, where an application doing register/unregister of
provided buffers will hit ctx->io_buffers_cache more often than actually
going to slab.

Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Link: https://lore.kernel.org/r/20231005000531.30800-4-krisman@suse.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Gabriel Krisman Bertazi and committed by

Jens Axboe 2 years ago b3a4dbc8 f74c746e

+30 -24

4 changed files

expand all

include

linux

io_uring_types.h

io_uring

io_uring.c

io_uring.h

kbuf.c

-2

include/linux/io_uring_types.h

··· 350 350 struct wait_queue_head rsrc_quiesce_wq; 351 351 unsigned rsrc_quiesce; 352 352 353 - struct list_head io_buffers_pages; 354 - 355 353 #if defined(CONFIG_UNIX) 356 354 struct socket *ring_sock; 357 355 #endif

+3 -1

io_uring/io_uring.c

··· 339 339 spin_lock_init(&ctx->completion_lock); 340 340 spin_lock_init(&ctx->timeout_lock); 341 341 INIT_WQ_LIST(&ctx->iopoll_list); 342 - INIT_LIST_HEAD(&ctx->io_buffers_pages); 343 342 INIT_LIST_HEAD(&ctx->io_buffers_comp); 344 343 INIT_LIST_HEAD(&ctx->defer_list); 345 344 INIT_LIST_HEAD(&ctx->timeout_list); ··· 4719 4720 SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, 4720 4721 offsetof(struct io_kiocb, cmd.data), 4721 4722 sizeof_field(struct io_kiocb, cmd.data), NULL); 4723 + io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0, 4724 + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, 4725 + NULL); 4722 4726 4723 4727 #ifdef CONFIG_SYSCTL 4724 4728 register_sysctl_init("kernel", kernel_io_uring_disabled_table);

io_uring/io_uring.h

··· 330 330 } 331 331 332 332 extern struct kmem_cache *req_cachep; 333 + extern struct kmem_cache *io_buf_cachep; 333 334 334 335 static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) 335 336 {

+26 -21

io_uring/kbuf.c

··· 22 22 /* BIDs are addressed by a 16-bit field in a CQE */ 23 23 #define MAX_BIDS_PER_BGID (1 << 16) 24 24 25 + struct kmem_cache *io_buf_cachep; 26 + 25 27 struct io_provide_buf { 26 28 struct file *file; 27 29 __u64 addr; ··· 260 258 void io_destroy_buffers(struct io_ring_ctx *ctx) 261 259 { 262 260 struct io_buffer_list *bl; 261 + struct list_head *item, *tmp; 262 + struct io_buffer *buf; 263 263 unsigned long index; 264 264 int i; 265 265 ··· 277 273 kfree(bl); 278 274 } 279 275 280 - while (!list_empty(&ctx->io_buffers_pages)) { 281 - struct page *page; 282 - 283 - page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); 284 - list_del_init(&page->lru); 285 - __free_page(page); 276 + list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { 277 + buf = list_entry(item, struct io_buffer, list); 278 + kmem_cache_free(io_buf_cachep, buf); 286 279 } 287 280 } 288 281 ··· 362 361 return 0; 363 362 } 364 363 364 + #define IO_BUFFER_ALLOC_BATCH 64 365 + 365 366 static int io_refill_buffer_cache(struct io_ring_ctx *ctx) 366 367 { 367 - struct io_buffer *buf; 368 - struct page *page; 369 - int bufs_in_page; 368 + struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH]; 369 + int allocated; 370 370 371 371 /* 372 372 * Completions that don't happen inline (eg not under uring_lock) will ··· 387 385 388 386 /* 389 387 * No free buffers and no completion entries either. Allocate a new 390 - * page worth of buffer entries and add those to our freelist. 388 + * batch of buffer entries and add those to our freelist. 
391 389 */ 392 - page = alloc_page(GFP_KERNEL_ACCOUNT); 393 - if (!page) 394 - return -ENOMEM; 395 390 396 - list_add(&page->lru, &ctx->io_buffers_pages); 397 - 398 - buf = page_address(page); 399 - bufs_in_page = PAGE_SIZE / sizeof(*buf); 400 - while (bufs_in_page) { 401 - list_add_tail(&buf->list, &ctx->io_buffers_cache); 402 - buf++; 403 - bufs_in_page--; 391 + allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT, 392 + ARRAY_SIZE(bufs), (void **) bufs); 393 + if (unlikely(!allocated)) { 394 + /* 395 + * Bulk alloc is all-or-nothing. If we fail to get a batch, 396 + * retry single alloc to be on the safe side. 397 + */ 398 + bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); 399 + if (!bufs[0]) 400 + return -ENOMEM; 401 + allocated = 1; 404 402 } 403 + 404 + while (allocated) 405 + list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache); 405 406 406 407 return 0; 407 408 }

Configure Feed

Configure Feed