Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

block: introduce a dedicated lock for protecting queue elevator updates

A queue's elevator can be updated either when modifying nr_hw_queues
or through the sysfs scheduler attribute. Currently, elevator switching/
updating is protected using q->sysfs_lock, but this has led to lockdep
splats[1] due to inconsistent lock ordering between q->sysfs_lock and
the freeze-lock in multiple block layer call sites.

As the scope of q->sysfs_lock is not well-defined, its (mis)use has
resulted in numerous lockdep warnings. To address this, introduce a new
q->elevator_lock, dedicated specifically for protecting elevator
switches/updates. And we'd now use this new q->elevator_lock instead of
q->sysfs_lock for protecting elevator switches/updates.

While at it, make elv_iosched_load_module() a static function, as it is
only called from elv_iosched_store(). Also, remove redundant parameters
from elv_iosched_load_module() function signature.

[1] https://lore.kernel.org/all/67637e70.050a0220.3157ee.000c.GAE@google.com/

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Link: https://lore.kernel.org/r/20250304102551.2533767-5-nilay@linux.ibm.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Nilay Shroff and committed by
Jens Axboe
1bf70d08 d23977fe

+61 -43
+1
block/blk-core.c
··· 429 429 430 430 refcount_set(&q->refs, 1); 431 431 mutex_init(&q->debugfs_mutex); 432 + mutex_init(&q->elevator_lock); 432 433 mutex_init(&q->sysfs_lock); 433 434 mutex_init(&q->limits_lock); 434 435 mutex_init(&q->rq_qos_mutex);
+7 -8
block/blk-mq.c
··· 4467 4467 unsigned long i, j; 4468 4468 4469 4469 /* protect against switching io scheduler */ 4470 - mutex_lock(&q->sysfs_lock); 4470 + mutex_lock(&q->elevator_lock); 4471 4471 for (i = 0; i < set->nr_hw_queues; i++) { 4472 4472 int old_node; 4473 4473 int node = blk_mq_get_hctx_node(set, i); ··· 4500 4500 4501 4501 xa_for_each_start(&q->hctx_table, j, hctx, j) 4502 4502 blk_mq_exit_hctx(q, set, hctx, j); 4503 - mutex_unlock(&q->sysfs_lock); 4503 + mutex_unlock(&q->elevator_lock); 4504 4504 4505 4505 /* unregister cpuhp callbacks for exited hctxs */ 4506 4506 blk_mq_remove_hw_queues_cpuhp(q); ··· 4933 4933 if (!qe) 4934 4934 return false; 4935 4935 4936 - /* q->elevator needs protection from ->sysfs_lock */ 4937 - mutex_lock(&q->sysfs_lock); 4936 + /* Accessing q->elevator needs protection from ->elevator_lock. */ 4937 + mutex_lock(&q->elevator_lock); 4938 4938 4939 - /* the check has to be done with holding sysfs_lock */ 4940 4939 if (!q->elevator) { 4941 4940 kfree(qe); 4942 4941 goto unlock; ··· 4949 4950 list_add(&qe->node, head); 4950 4951 elevator_disable(q); 4951 4952 unlock: 4952 - mutex_unlock(&q->sysfs_lock); 4953 + mutex_unlock(&q->elevator_lock); 4953 4954 4954 4955 return true; 4955 4956 } ··· 4979 4980 list_del(&qe->node); 4980 4981 kfree(qe); 4981 4982 4982 - mutex_lock(&q->sysfs_lock); 4983 + mutex_lock(&q->elevator_lock); 4983 4984 elevator_switch(q, t); 4984 4985 /* drop the reference acquired in blk_mq_elv_switch_none */ 4985 4986 elevator_put(t); 4986 - mutex_unlock(&q->sysfs_lock); 4987 + mutex_unlock(&q->elevator_lock); 4987 4988 } 4988 4989 4989 4990 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
+23 -11
block/blk-sysfs.c
··· 693 693 * Attributes which are protected with q->sysfs_lock. 694 694 */ 695 695 &queue_requests_entry.attr, 696 - &elv_iosched_entry.attr, 697 696 #ifdef CONFIG_BLK_WBT 698 697 &queue_wb_lat_entry.attr, 699 698 #endif 699 + /* 700 + * Attributes which require some form of locking other than 701 + * q->sysfs_lock. 702 + */ 703 + &elv_iosched_entry.attr, 704 + 700 705 /* 701 706 * Attributes which don't require locking. 702 707 */ ··· 870 865 if (ret) 871 866 goto out_debugfs_remove; 872 867 873 - if (q->elevator) { 874 - ret = elv_register_queue(q, false); 875 - if (ret) 876 - goto out_unregister_ia_ranges; 877 - } 878 - 879 868 ret = blk_crypto_sysfs_register(disk); 880 869 if (ret) 881 - goto out_elv_unregister; 870 + goto out_unregister_ia_ranges; 871 + 872 + mutex_lock(&q->elevator_lock); 873 + if (q->elevator) { 874 + ret = elv_register_queue(q, false); 875 + if (ret) { 876 + mutex_unlock(&q->elevator_lock); 877 + goto out_crypto_sysfs_unregister; 878 + } 879 + } 880 + mutex_unlock(&q->elevator_lock); 882 881 883 882 blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); 884 883 wbt_enable_default(disk); ··· 907 898 908 899 return ret; 909 900 910 - out_elv_unregister: 911 - elv_unregister_queue(q); 901 + out_crypto_sysfs_unregister: 902 + blk_crypto_sysfs_unregister(disk); 912 903 out_unregister_ia_ranges: 913 904 disk_unregister_independent_access_ranges(disk); 914 905 out_debugfs_remove: ··· 954 945 blk_mq_sysfs_unregister(disk); 955 946 blk_crypto_sysfs_unregister(disk); 956 947 957 - mutex_lock(&q->sysfs_lock); 948 + mutex_lock(&q->elevator_lock); 958 949 elv_unregister_queue(q); 950 + mutex_unlock(&q->elevator_lock); 951 + 952 + mutex_lock(&q->sysfs_lock); 959 953 disk_unregister_independent_access_ranges(disk); 960 954 mutex_unlock(&q->sysfs_lock); 961 955
+16 -19
block/elevator.c
··· 457 457 struct elevator_queue *e = q->elevator; 458 458 int error; 459 459 460 - lockdep_assert_held(&q->sysfs_lock); 460 + lockdep_assert_held(&q->elevator_lock); 461 461 462 462 error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); 463 463 if (!error) { ··· 481 481 { 482 482 struct elevator_queue *e = q->elevator; 483 483 484 - lockdep_assert_held(&q->sysfs_lock); 484 + lockdep_assert_held(&q->elevator_lock); 485 485 486 486 if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { 487 487 kobject_uevent(&e->kobj, KOBJ_REMOVE); ··· 618 618 unsigned int memflags; 619 619 int ret; 620 620 621 - lockdep_assert_held(&q->sysfs_lock); 621 + lockdep_assert_held(&q->elevator_lock); 622 622 623 623 memflags = blk_mq_freeze_queue(q); 624 624 blk_mq_quiesce_queue(q); ··· 655 655 { 656 656 unsigned int memflags; 657 657 658 - lockdep_assert_held(&q->sysfs_lock); 658 + lockdep_assert_held(&q->elevator_lock); 659 659 660 660 memflags = blk_mq_freeze_queue(q); 661 661 blk_mq_quiesce_queue(q); ··· 700 700 return ret; 701 701 } 702 702 703 - void elv_iosched_load_module(struct gendisk *disk, const char *buf, 704 - size_t count) 703 + static void elv_iosched_load_module(char *elevator_name) 705 704 { 706 - char elevator_name[ELV_NAME_MAX]; 707 705 struct elevator_type *found; 708 - const char *name; 709 - 710 - strscpy(elevator_name, buf, sizeof(elevator_name)); 711 - name = strstrip(elevator_name); 712 706 713 707 spin_lock(&elv_list_lock); 714 - found = __elevator_find(name); 708 + found = __elevator_find(elevator_name); 715 709 spin_unlock(&elv_list_lock); 716 710 717 711 if (!found) 718 - request_module("%s-iosched", name); 712 + request_module("%s-iosched", elevator_name); 719 713 } 720 714 721 715 ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, 722 716 size_t count) 723 717 { 724 718 char elevator_name[ELV_NAME_MAX]; 719 + char *name; 725 720 int ret; 726 721 unsigned int memflags; 727 722 struct request_queue *q = disk->queue; ··· 726 731 * queue to ensure that the module file can be read when the request 727 732 * queue is the one for the device storing the module file. 728 733 */ 729 - elv_iosched_load_module(disk, buf, count); 730 734 strscpy(elevator_name, buf, sizeof(elevator_name)); 735 + name = strstrip(elevator_name); 731 736 732 - mutex_lock(&q->sysfs_lock); 737 + elv_iosched_load_module(name); 738 + 733 739 memflags = blk_mq_freeze_queue(q); 734 - ret = elevator_change(q, strstrip(elevator_name)); 740 + mutex_lock(&q->elevator_lock); 741 + ret = elevator_change(q, name); 735 742 if (!ret) 736 743 ret = count; 744 + mutex_unlock(&q->elevator_lock); 737 745 blk_mq_unfreeze_queue(q, memflags); 738 - mutex_unlock(&q->sysfs_lock); 739 746 return ret; 740 747 } 741 748 ··· 748 751 struct elevator_type *cur = NULL, *e; 749 752 int len = 0; 750 753 751 - mutex_lock(&q->sysfs_lock); 754 + mutex_lock(&q->elevator_lock); 752 755 if (!q->elevator) { 753 756 len += sprintf(name+len, "[none] "); 754 757 } else { ··· 766 769 spin_unlock(&elv_list_lock); 767 770 768 771 len += sprintf(name+len, "\n"); 769 - mutex_unlock(&q->sysfs_lock); 772 + mutex_unlock(&q->elevator_lock); 770 773 771 774 return len; 772 775 }
-2
block/elevator.h
··· 148 148 * io scheduler sysfs switching 149 149 */ 150 150 ssize_t elv_iosched_show(struct gendisk *disk, char *page); 151 - void elv_iosched_load_module(struct gendisk *disk, const char *page, 152 - size_t count); 153 151 ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); 154 152 155 153 extern bool elv_bio_merge_ok(struct request *, struct bio *);
+6 -3
block/genhd.c
··· 565 565 if (disk->major == BLOCK_EXT_MAJOR) 566 566 blk_free_ext_minor(disk->first_minor); 567 567 out_exit_elevator: 568 - if (disk->queue->elevator) 568 + if (disk->queue->elevator) { 569 + mutex_lock(&disk->queue->elevator_lock); 569 570 elevator_exit(disk->queue); 571 + mutex_unlock(&disk->queue->elevator_lock); 572 + } 570 573 return ret; 571 574 } 572 575 EXPORT_SYMBOL_GPL(add_disk_fwnode); ··· 745 742 746 743 blk_mq_quiesce_queue(q); 747 744 if (q->elevator) { 748 - mutex_lock(&q->sysfs_lock); 745 + mutex_lock(&q->elevator_lock); 749 746 elevator_exit(q); 750 - mutex_unlock(&q->sysfs_lock); 747 + mutex_unlock(&q->elevator_lock); 751 748 } 752 749 rq_qos_exit(q); 753 750 blk_mq_unquiesce_queue(q);
+8
include/linux/blkdev.h
··· 560 560 struct blk_flush_queue *fq; 561 561 struct list_head flush_list; 562 562 563 + /* 564 + * Protects against I/O scheduler switching, specifically when 565 + * updating q->elevator. To ensure proper locking order during 566 + * an elevator update, first freeze the queue, then acquire 567 + * ->elevator_lock. 568 + */ 569 + struct mutex elevator_lock; 570 + 563 571 struct mutex sysfs_lock; 564 572 struct mutex limits_lock; 565 573