Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'loop-aio-nowait' into for-6.19/block

Merge async IO IOCB_NOWAIT support from Ming:

"This patchset improves loop aio perf by using IOCB_NOWAIT for avoiding
to queue aio command to workqueue context, meantime refactor
lo_rw_aio() a bit.

In my test VM, loop disk performance becomes very close to that of the
backing block device (nvme/mq virtio-scsi).

And Mikulas verified that this approach can improve 12-job sequential
read/write IO by ~5X, and basically solves the reported problem together
with the loop MQ change.

https://lore.kernel.org/linux-block/a8e5c76a-231f-07d1-a394-847de930f638@redhat.com/

Zhaoyang Huang also mentioned that it may fix their performance issue in
an Android use case.

The loop MQ change will be posted as a standalone patch, because it needs
a UAPI change."

Link: https://lore.kernel.org/linux-block/20251015110735.1361261-1-ming.lei@redhat.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>

* loop-aio-nowait:
  loop: add hint for handling aio via IOCB_NOWAIT
  loop: try to handle loop aio command via NOWAIT IO first
  loop: move command blkcg/memcg initialization into loop_queue_work
  loop: add lo_submit_rw_aio()
  loop: add helper lo_rw_aio_prep()
  loop: add helper lo_cmd_nr_bvec()
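
The series' fast path relies on the NOWAIT contract: a NOWAIT submission
either completes without blocking or fails with -EAGAIN, and only the
-EAGAIN case falls back to the workqueue. Below is a minimal userspace
sketch of the same try-NOWAIT-first pattern, using preadv2(2) with
RWF_NOWAIT (both long-standing Linux APIs); the helper names, fallback,
and test file path are illustrative only, not part of this series.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

/* Hypothetical slow path: a plain blocking read, standing in for the
 * loop driver's workqueue-based aio submission. */
static ssize_t blocking_read(int fd, struct iovec *iov, off_t off)
{
        return preadv2(fd, iov, 1, off, 0);
}

/* Fast path first: with RWF_NOWAIT the kernel returns -EAGAIN instead
 * of blocking, e.g. when the data is not already in the page cache. */
static ssize_t nowait_first_read(int fd, struct iovec *iov, off_t off)
{
        ssize_t ret = preadv2(fd, iov, 1, off, RWF_NOWAIT);

        if (ret >= 0 || errno != EAGAIN)
                return ret;             /* completed without blocking */
        return blocking_read(fd, iov, off);
}

int main(int argc, char **argv)
{
        char buf[4096];
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
        int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        printf("read %zd bytes\n", nowait_first_read(fd, &iov, 0));
        close(fd);
        return 0;
}

The driver-side version in the diff below follows the same shape:
lo_rw_aio_nowait() sets IOCB_NOWAIT and returns -EAGAIN to request the
workqueue fallback, while lo_rw_aio() clears the flag for the blocking
retry.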

+194 -39
drivers/block/loop.c
···
        struct rb_root worker_tree;
        struct timer_list timer;
        bool sysfs_inited;
+       unsigned lo_nr_blocking_writes;

        struct request_queue *lo_queue;
        struct blk_mq_tag_set tag_set;
···

#define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
#define LOOP_DEFAULT_HW_Q_DEPTH 128
+
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd);

static DEFINE_IDR(loop_index_idr);
static DEFINE_MUTEX(loop_ctl_mutex);
···

        if (!atomic_dec_and_test(&cmd->ref))
                return;
+
+       /* -EAGAIN could be returned from bdev's ->ki_complete */
+       if (cmd->ret == -EAGAIN) {
+               struct loop_device *lo = rq->q->queuedata;
+
+               loop_queue_work(lo, cmd);
+               return;
+       }
+
        kfree(cmd->bvec);
        cmd->bvec = NULL;
        if (req_op(rq) == REQ_OP_WRITE)
···
        lo_rw_aio_do_completion(cmd);
}

-static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
-               loff_t pos, int rw)
+static inline unsigned lo_cmd_nr_bvec(struct loop_cmd *cmd)
{
-       struct iov_iter iter;
-       struct req_iterator rq_iter;
-       struct bio_vec *bvec;
        struct request *rq = blk_mq_rq_from_pdu(cmd);
-       struct bio *bio = rq->bio;
-       struct file *file = lo->lo_backing_file;
+       struct req_iterator rq_iter;
        struct bio_vec tmp;
-       unsigned int offset;
        int nr_bvec = 0;
-       int ret;

        rq_for_each_bvec(tmp, rq, rq_iter)
                nr_bvec++;

+       return nr_bvec;
+}
+
+static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd,
+               unsigned nr_bvec, loff_t pos)
+{
+       struct request *rq = blk_mq_rq_from_pdu(cmd);
+
        if (rq->bio != rq->biotail) {
+               struct req_iterator rq_iter;
+               struct bio_vec *bvec;
+               struct bio_vec tmp;

                bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
                                     GFP_NOIO);
···
                        *bvec = tmp;
                        bvec++;
                }
-               bvec = cmd->bvec;
-               offset = 0;
        } else {
-               /*
-                * Same here, this bio may be started from the middle of the
-                * 'bvec' because of bio splitting, so offset from the bvec
-                * must be passed to iov iterator
-                */
-               offset = bio->bi_iter.bi_bvec_done;
-               bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+               cmd->bvec = NULL;
        }
-       atomic_set(&cmd->ref, 2);
-
-       iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
-       iter.iov_offset = offset;

        cmd->iocb.ki_pos = pos;
-       cmd->iocb.ki_filp = file;
+       cmd->iocb.ki_filp = lo->lo_backing_file;
        cmd->iocb.ki_ioprio = req_get_ioprio(rq);
        if (cmd->use_aio) {
                cmd->iocb.ki_complete = lo_rw_aio_complete;
···
                cmd->iocb.ki_complete = NULL;
                cmd->iocb.ki_flags = 0;
        }
+       return 0;
+}
+
+static int lo_submit_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+               int nr_bvec, int rw)
+{
+       struct request *rq = blk_mq_rq_from_pdu(cmd);
+       struct file *file = lo->lo_backing_file;
+       struct iov_iter iter;
+       int ret;
+
+       if (cmd->bvec) {
+               iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
+               iter.iov_offset = 0;
+       } else {
+               struct bio *bio = rq->bio;
+               struct bio_vec *bvec = __bvec_iter_bvec(bio->bi_io_vec,
+                               bio->bi_iter);
+
+               /*
+                * Same here, this bio may be started from the middle of the
+                * 'bvec' because of bio splitting, so offset from the bvec
+                * must be passed to iov iterator
+                */
+               iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
+               iter.iov_offset = bio->bi_iter.bi_bvec_done;
+       }
+       atomic_set(&cmd->ref, 2);
+

        if (rw == ITER_SOURCE) {
                kiocb_start_write(&cmd->iocb);
···
                ret = file->f_op->read_iter(&cmd->iocb, &iter);

        lo_rw_aio_do_completion(cmd);
+       return ret;
+}

+static bool lo_backfile_support_nowait(const struct loop_device *lo)
+{
+       return lo->lo_backing_file->f_mode & FMODE_NOWAIT;
+}
+
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+               loff_t pos, int rw)
+{
+       int nr_bvec = lo_cmd_nr_bvec(cmd);
+       int ret;
+
+       /* prepared already if we have tried nowait */
+       if (!cmd->use_aio || !lo_backfile_support_nowait(lo)) {
+               ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos);
+               if (unlikely(ret))
+                       goto fail;
+       }
+
+       cmd->iocb.ki_flags &= ~IOCB_NOWAIT;
+       ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw);
+fail:
        if (ret != -EIOCBQUEUED)
                lo_rw_aio_complete(&cmd->iocb, ret);
        return -EIOCBQUEUED;
+}
+
+static inline bool lo_aio_try_nowait(struct loop_device *lo,
+               struct loop_cmd *cmd)
+{
+       struct file *file = lo->lo_backing_file;
+       struct inode *inode = file->f_mapping->host;
+       struct request *rq = blk_mq_rq_from_pdu(cmd);
+
+       /* NOWAIT works fine for backing block device */
+       if (S_ISBLK(inode->i_mode))
+               return true;
+
+       /*
+        * NOWAIT is supposed to be fine for READ without contending with
+        * blocking WRITE
+        */
+       if (req_op(rq) == REQ_OP_READ)
+               return true;
+
+       /*
+        * If there is any queued non-NOWAIT async WRITE, don't try new
+        * NOWAIT WRITE for avoiding contention
+        *
+        * Here we focus on handling stable FS block mapping via NOWAIT
+        */
+       return READ_ONCE(lo->lo_nr_blocking_writes) == 0;
+}
+
+static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd,
+               int rw)
+{
+       struct request *rq = blk_mq_rq_from_pdu(cmd);
+       loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
+       int nr_bvec = lo_cmd_nr_bvec(cmd);
+       int ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos);
+
+       if (unlikely(ret))
+               goto fail;
+
+       if (!lo_aio_try_nowait(lo, cmd))
+               return -EAGAIN;
+
+       cmd->iocb.ki_flags |= IOCB_NOWAIT;
+       ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw);
+fail:
+       if (ret != -EIOCBQUEUED && ret != -EAGAIN)
+               lo_rw_aio_complete(&cmd->iocb, ret);
+       return ret;
}

static int do_req_filebacked(struct loop_device *lo, struct request *rq)
···
        return sysfs_emit(buf, "%s\n", dio ? "1" : "0");
}

+static ssize_t loop_attr_nr_blocking_writes_show(struct loop_device *lo,
+               char *buf)
+{
+       return sysfs_emit(buf, "%u\n", lo->lo_nr_blocking_writes);
+}
+
LOOP_ATTR_RO(backing_file);
LOOP_ATTR_RO(offset);
LOOP_ATTR_RO(sizelimit);
LOOP_ATTR_RO(autoclear);
LOOP_ATTR_RO(partscan);
LOOP_ATTR_RO(dio);
+LOOP_ATTR_RO(nr_blocking_writes);

static struct attribute *loop_attrs[] = {
        &loop_attr_backing_file.attr,
···
        &loop_attr_autoclear.attr,
        &loop_attr_partscan.attr,
        &loop_attr_dio.attr,
+       &loop_attr_nr_blocking_writes.attr,
        NULL,
};
···
}
#endif

+static inline void loop_inc_blocking_writes(struct loop_device *lo,
+               struct loop_cmd *cmd)
+{
+       lockdep_assert_held(&lo->lo_work_lock);
+
+       if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE)
+               lo->lo_nr_blocking_writes += 1;
+}
+
+static inline void loop_dec_blocking_writes(struct loop_device *lo,
+               struct loop_cmd *cmd)
+{
+       lockdep_assert_held(&lo->lo_work_lock);
+
+       if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE)
+               lo->lo_nr_blocking_writes -= 1;
+}
+
static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
{
+       struct request __maybe_unused *rq = blk_mq_rq_from_pdu(cmd);
        struct rb_node **node, *parent = NULL;
        struct loop_worker *cur_worker, *worker = NULL;
        struct work_struct *work;
        struct list_head *cmd_list;
+
+       /* always use the first bio's css */
+       cmd->blkcg_css = NULL;
+       cmd->memcg_css = NULL;
+#ifdef CONFIG_BLK_CGROUP
+       if (rq->bio) {
+               cmd->blkcg_css = bio_blkcg_css(rq->bio);
+#ifdef CONFIG_MEMCG
+               if (cmd->blkcg_css) {
+                       cmd->memcg_css =
+                               cgroup_get_e_css(cmd->blkcg_css->cgroup,
+                                               &memory_cgrp_subsys);
+               }
+#endif
+       }
+#endif

        spin_lock_irq(&lo->lo_work_lock);
···
                work = &lo->rootcg_work;
                cmd_list = &lo->rootcg_cmd_list;
        }
+       if (cmd->use_aio)
+               loop_inc_blocking_writes(lo, cmd);
        list_add_tail(&cmd->list_entry, cmd_list);
        queue_work(lo->workqueue, work);
        spin_unlock_irq(&lo->lo_work_lock);
···
        struct request *rq = bd->rq;
        struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
        struct loop_device *lo = rq->q->queuedata;
+       int rw = 0;

        blk_mq_start_request(rq);
···
        case REQ_OP_WRITE_ZEROES:
                cmd->use_aio = false;
                break;
-       default:
+       case REQ_OP_READ:
+               rw = ITER_DEST;
                cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
                break;
+       case REQ_OP_WRITE:
+               rw = ITER_SOURCE;
+               cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
+               break;
+       default:
+               return BLK_STS_IOERR;
        }

-       /* always use the first bio's css */
-       cmd->blkcg_css = NULL;
-       cmd->memcg_css = NULL;
-#ifdef CONFIG_BLK_CGROUP
-       if (rq->bio) {
-               cmd->blkcg_css = bio_blkcg_css(rq->bio);
-#ifdef CONFIG_MEMCG
-               if (cmd->blkcg_css) {
-                       cmd->memcg_css =
-                               cgroup_get_e_css(cmd->blkcg_css->cgroup,
-                                               &memory_cgrp_subsys);
-               }
-#endif
+       /* try NOWAIT if the backing file supports the mode */
+       if (cmd->use_aio && lo_backfile_support_nowait(lo)) {
+               int res = lo_rw_aio_nowait(lo, cmd, rw);
+
+               if (res != -EAGAIN && res != -EOPNOTSUPP)
+                       return BLK_STS_OK;
+               /* fallback to workqueue for handling aio */
        }
-#endif
+
        loop_queue_work(lo, cmd);

        return BLK_STS_OK;
···
                cond_resched();

                spin_lock_irq(&lo->lo_work_lock);
+               if (cmd->use_aio)
+                       loop_dec_blocking_writes(lo, cmd);
        }

        /*
···
        lo->tag_set.queue_depth = hw_queue_depth;
        lo->tag_set.numa_node = NUMA_NO_NODE;
        lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-       lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT;
+       lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT |
+               BLK_MQ_F_BLOCKING;
        lo->tag_set.driver_data = lo;

        err = blk_mq_alloc_tag_set(&lo->tag_set);
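
The new lo_nr_blocking_writes counter is exported read-only through sysfs by
loop_attr_nr_blocking_writes_show() above, alongside the existing loop
attributes. A minimal sketch of reading it from userspace, assuming the
attribute appears in the per-device "loop" sysfs group next to backing_file
and dio (the loop0 path below is an illustrative example, not taken from the
patches):

#include <stdio.h>

int main(void)
{
        unsigned int nr;
        FILE *f = fopen("/sys/block/loop0/loop/nr_blocking_writes", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        /* the attribute prints a single unsigned decimal value */
        if (fscanf(f, "%u", &nr) == 1)
                printf("blocking async writes in flight: %u\n", nr);
        fclose(f);
        return 0;
}

A non-zero value means blocking async WRITEs are queued to the workqueue,
which is exactly the condition under which lo_aio_try_nowait() declines to
issue new NOWAIT writes.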