Merge tag 'block-6.7-2023-12-08' of git://git.kernel.dk/linux

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge tag 'block-6.7-2023-12-08' of git://git.kernel.dk/linux

Pull block fixes from Jens Axboe:
"Nothing major in here, just miscellaneous fixes for MD and NVMe:

- NVMe pull request via Keith:
- Proper nvme ctrl state setting (Keith)
- Passthrough command optimization (Keith)
- Spectre fix (Nitesh)
- Kconfig clarifications (Shin'ichiro)
- Frozen state deadlock fix (Bitao)
- Power setting quirk (Georg)

- MD pull requests via Song:
- 6.7 regressions with recovery/sync (Yu)
- Reshape fix (David)"

* tag 'block-6.7-2023-12-08' of git://git.kernel.dk/linux:
md: split MD_RECOVERY_NEEDED out of mddev_resume
nvme-pci: Add sleep quirk for Kingston drives
md: fix stopping sync thread
md: don't leave 'MD_RECOVERY_FROZEN' in error path of md_set_readonly()
md: fix missing flush of sync_work
nvme: fix deadlock between reset and scan
nvme: prevent potential spectre v1 gadget
nvme: improve NVME_HOST_AUTH and NVME_TARGET_AUTH config descriptions
nvme-ioctl: move capable() admin check to the end
nvme: ensure reset state check ordering
nvme: introduce helper function to get ctrl state
md/raid6: use valid sector values to determine if an I/O should wait on the reshape

Linus Torvalds 2 years ago d71369db 689659c9

+197 -134

12 changed files

expand all collapse all

drivers

md.c

raid5.c

nvme

host

Kconfig

core.c

fc.c

ioctl.c

nvme.h

pci.c

rdma.c

tcp.c

target

Kconfig

configfs.c

+76 -68

drivers/md/md.c

reviewed

··· 490 490 } 491 491 EXPORT_SYMBOL_GPL(mddev_suspend); 492 492 493 493 - void mddev_resume(struct mddev *mddev) 493 493 + static void __mddev_resume(struct mddev *mddev, bool recovery_needed) 494 494 { 495 495 lockdep_assert_not_held(&mddev->reconfig_mutex); 496 496 ··· 507 507 percpu_ref_resurrect(&mddev->active_io); 508 508 wake_up(&mddev->sb_wait); 509 509 510 510 - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 510 510 + if (recovery_needed) 511 511 + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 511 512 md_wakeup_thread(mddev->thread); 512 513 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 513 514 514 515 mutex_unlock(&mddev->suspend_mutex); 516 516 + } 517 517 + 518 518 + void mddev_resume(struct mddev *mddev) 519 519 + { 520 520 + return __mddev_resume(mddev, true); 515 521 } 516 522 EXPORT_SYMBOL_GPL(mddev_resume); 517 523 ··· 4846 4840 return sprintf(page, "%s\n", type); 4847 4841 } 4848 4842 4849 4849 - static void stop_sync_thread(struct mddev *mddev) 4843 4843 + /** 4844 4844 + * stop_sync_thread() - wait for sync_thread to stop if it's running. 4845 4845 + * @mddev: the array. 4846 4846 + * @locked: if set, reconfig_mutex will still be held after this function 4847 4847 + * return; if not set, reconfig_mutex will be released after this 4848 4848 + * function return. 4849 4849 + * @check_seq: if set, only wait for curent running sync_thread to stop, noted 4850 4850 + * that new sync_thread can still start. 4851 4851 + */ 4852 4852 + static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) 4850 4853 { 4851 4851 - if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4852 4852 - return; 4854 4854 + int sync_seq; 4853 4855 4854 4854 - if (mddev_lock(mddev)) 4855 4855 - return; 4856 4856 + if (check_seq) 4857 4857 + sync_seq = atomic_read(&mddev->sync_seq); 4856 4858 4857 4857 - /* 4858 4858 - * Check again in case MD_RECOVERY_RUNNING is cleared before lock is 4859 4859 - * held. 
4860 4860 - */ 4861 4859 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4862 4862 - mddev_unlock(mddev); 4860 4860 + if (!locked) 4861 4861 + mddev_unlock(mddev); 4863 4862 return; 4864 4863 } 4865 4864 4866 4866 - if (work_pending(&mddev->del_work)) 4867 4867 - flush_workqueue(md_misc_wq); 4865 4865 + mddev_unlock(mddev); 4868 4866 4869 4867 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4870 4868 /* ··· 4876 4866 * never happen 4877 4867 */ 4878 4868 md_wakeup_thread_directly(mddev->sync_thread); 4869 4869 + if (work_pending(&mddev->sync_work)) 4870 4870 + flush_work(&mddev->sync_work); 4879 4871 4880 4880 - mddev_unlock(mddev); 4872 4872 + wait_event(resync_wait, 4873 4873 + !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4874 4874 + (check_seq && sync_seq != atomic_read(&mddev->sync_seq))); 4875 4875 + 4876 4876 + if (locked) 4877 4877 + mddev_lock_nointr(mddev); 4881 4878 } 4882 4879 4883 4880 static void idle_sync_thread(struct mddev *mddev) 4884 4881 { 4885 4885 - int sync_seq = atomic_read(&mddev->sync_seq); 4886 4886 - 4887 4882 mutex_lock(&mddev->sync_mutex); 4888 4883 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4889 4889 - stop_sync_thread(mddev); 4890 4884 4891 4891 - wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || 4892 4892 - !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4885 4885 + if (mddev_lock(mddev)) { 4886 4886 + mutex_unlock(&mddev->sync_mutex); 4887 4887 + return; 4888 4888 + } 4893 4889 4890 4890 + stop_sync_thread(mddev, false, true); 4894 4891 mutex_unlock(&mddev->sync_mutex); 4895 4892 } 4896 4893 ··· 4905 4888 { 4906 4889 mutex_lock(&mddev->sync_mutex); 4907 4890 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4908 4908 - stop_sync_thread(mddev); 4909 4891 4910 4910 - wait_event(resync_wait, mddev->sync_thread == NULL && 4911 4911 - !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4892 4892 + if (mddev_lock(mddev)) { 4893 4893 + mutex_unlock(&mddev->sync_mutex); 4894 4894 + return; 4895 4895 + } 
4912 4896 4897 4897 + stop_sync_thread(mddev, false, false); 4913 4898 mutex_unlock(&mddev->sync_mutex); 4914 4899 } 4915 4900 ··· 6283 6264 6284 6265 static void __md_stop_writes(struct mddev *mddev) 6285 6266 { 6286 6286 - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6287 6287 - if (work_pending(&mddev->del_work)) 6288 6288 - flush_workqueue(md_misc_wq); 6289 6289 - if (mddev->sync_thread) { 6290 6290 - set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6291 6291 - md_reap_sync_thread(mddev); 6292 6292 - } 6293 6293 - 6267 6267 + stop_sync_thread(mddev, true, false); 6294 6268 del_timer_sync(&mddev->safemode_timer); 6295 6269 6296 6270 if (mddev->pers && mddev->pers->quiesce) { ··· 6367 6355 int err = 0; 6368 6356 int did_freeze = 0; 6369 6357 6358 6358 + if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6359 6359 + return -EBUSY; 6360 6360 + 6370 6361 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6371 6362 did_freeze = 1; 6372 6363 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6373 6364 md_wakeup_thread(mddev->thread); 6374 6365 } 6375 6375 - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6376 6376 - set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6377 6366 6378 6378 - /* 6379 6379 - * Thread might be blocked waiting for metadata update which will now 6380 6380 - * never happen 6381 6381 - */ 6382 6382 - md_wakeup_thread_directly(mddev->sync_thread); 6383 6383 - 6384 6384 - if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6385 6385 - return -EBUSY; 6386 6386 - mddev_unlock(mddev); 6387 6387 - wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 6388 6388 - &mddev->recovery)); 6367 6367 + stop_sync_thread(mddev, false, false); 6389 6368 wait_event(mddev->sb_wait, 6390 6369 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6391 6370 mddev_lock_nointr(mddev); ··· 6386 6383 mddev->sync_thread || 6387 6384 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6388 6385 pr_warn("md: %s still in 
use.\n",mdname(mddev)); 6389 6389 - if (did_freeze) { 6390 6390 - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6391 6391 - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6392 6392 - md_wakeup_thread(mddev->thread); 6393 6393 - } 6394 6386 err = -EBUSY; 6395 6387 goto out; 6396 6388 } 6389 6389 + 6397 6390 if (mddev->pers) { 6398 6391 __md_stop_writes(mddev); 6399 6392 6400 6400 - err = -ENXIO; 6401 6401 - if (mddev->ro == MD_RDONLY) 6393 6393 + if (mddev->ro == MD_RDONLY) { 6394 6394 + err = -ENXIO; 6402 6395 goto out; 6396 6396 + } 6397 6397 + 6403 6398 mddev->ro = MD_RDONLY; 6404 6399 set_disk_ro(mddev->gendisk, 1); 6400 6400 + } 6401 6401 + 6402 6402 + out: 6403 6403 + if ((mddev->pers && !err) || did_freeze) { 6405 6404 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6406 6405 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6407 6406 md_wakeup_thread(mddev->thread); 6408 6407 sysfs_notify_dirent_safe(mddev->sysfs_state); 6409 6409 - err = 0; 6410 6408 } 6411 6411 - out: 6409 6409 + 6412 6410 mutex_unlock(&mddev->open_mutex); 6413 6411 return err; 6414 6412 } ··· 6430 6426 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6431 6427 md_wakeup_thread(mddev->thread); 6432 6428 } 6433 6433 - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6434 6434 - set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6435 6429 6436 6436 - /* 6437 6437 - * Thread might be blocked waiting for metadata update which will now 6438 6438 - * never happen 6439 6439 - */ 6440 6440 - md_wakeup_thread_directly(mddev->sync_thread); 6441 6441 - 6442 6442 - mddev_unlock(mddev); 6443 6443 - wait_event(resync_wait, (mddev->sync_thread == NULL && 6444 6444 - !test_bit(MD_RECOVERY_RUNNING, 6445 6445 - &mddev->recovery))); 6446 6446 - mddev_lock_nointr(mddev); 6430 6430 + stop_sync_thread(mddev, true, false); 6447 6431 6448 6432 mutex_lock(&mddev->open_mutex); 6449 6433 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || ··· 9395 9403 goto not_running; 9396 9404 } 9397 9405 9398 9398 - 
suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 9406 9406 + mddev_unlock(mddev); 9407 9407 + /* 9408 9408 + * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9409 9409 + * not set it again. Otherwise, we may cause issue like this one: 9410 9410 + * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9411 9411 + * Therefore, use __mddev_resume(mddev, false). 9412 9412 + */ 9413 9413 + if (suspend) 9414 9414 + __mddev_resume(mddev, false); 9399 9415 md_wakeup_thread(mddev->sync_thread); 9400 9416 sysfs_notify_dirent_safe(mddev->sysfs_action); 9401 9417 md_new_event(); ··· 9415 9415 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9416 9416 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9417 9417 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9418 9418 - suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 9418 9418 + mddev_unlock(mddev); 9419 9419 + /* 9420 9420 + * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9421 9421 + * not set it again. Otherwise, we may cause issue like this one: 9422 9422 + * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9423 9423 + * Therefore, use __mddev_resume(mddev, false). 9424 9424 + */ 9425 9425 + if (suspend) 9426 9426 + __mddev_resume(mddev, false); 9419 9427 9420 9428 wake_up(&resync_wait); 9421 9429 if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&

+2 -2

drivers/md/raid5.c

reviewed

··· 5892 5892 int dd_idx; 5893 5893 5894 5894 for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { 5895 5895 - if (dd_idx == sh->pd_idx) 5895 5895 + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 5896 5896 continue; 5897 5897 5898 5898 min_sector = min(min_sector, sh->dev[dd_idx].sector); 5899 5899 - max_sector = min(max_sector, sh->dev[dd_idx].sector); 5899 5899 + max_sector = max(max_sector, sh->dev[dd_idx].sector); 5900 5900 } 5901 5901 5902 5902 spin_lock_irq(&conf->device_lock);

+3 -2

drivers/nvme/host/Kconfig

reviewed

··· 107 107 If unsure, say N. 108 108 109 109 config NVME_HOST_AUTH 110 110 - bool "NVM Express over Fabrics In-Band Authentication" 110 110 + bool "NVMe over Fabrics In-Band Authentication in host side" 111 111 depends on NVME_CORE 112 112 select NVME_AUTH 113 113 help 114 114 - This provides support for NVMe over Fabrics In-Band Authentication. 114 114 + This provides support for NVMe over Fabrics In-Band Authentication in 115 115 + host side. 115 116 116 117 If unsure, say N. 117 118

+32 -20

drivers/nvme/host/core.c

reviewed

··· 131 131 /* 132 132 * Only new queue scan work when admin and IO queues are both alive 133 133 */ 134 134 - if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset) 134 134 + if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset) 135 135 queue_work(nvme_wq, &ctrl->scan_work); 136 136 } 137 137 ··· 143 143 */ 144 144 int nvme_try_sched_reset(struct nvme_ctrl *ctrl) 145 145 { 146 146 - if (ctrl->state != NVME_CTRL_RESETTING) 146 146 + if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING) 147 147 return -EBUSY; 148 148 if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) 149 149 return -EBUSY; ··· 156 156 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), 157 157 struct nvme_ctrl, failfast_work); 158 158 159 159 - if (ctrl->state != NVME_CTRL_CONNECTING) 159 159 + if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING) 160 160 return; 161 161 162 162 set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); ··· 200 200 ret = nvme_reset_ctrl(ctrl); 201 201 if (!ret) { 202 202 flush_work(&ctrl->reset_work); 203 203 - if (ctrl->state != NVME_CTRL_LIVE) 203 203 + if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) 204 204 ret = -ENETRESET; 205 205 } 206 206 ··· 499 499 500 500 spin_lock_irqsave(&ctrl->lock, flags); 501 501 502 502 - old_state = ctrl->state; 502 502 + old_state = nvme_ctrl_state(ctrl); 503 503 switch (new_state) { 504 504 case NVME_CTRL_LIVE: 505 505 switch (old_state) { ··· 567 567 } 568 568 569 569 if (changed) { 570 570 - ctrl->state = new_state; 570 570 + WRITE_ONCE(ctrl->state, new_state); 571 571 wake_up_all(&ctrl->state_wq); 572 572 } 573 573 ··· 575 575 if (!changed) 576 576 return false; 577 577 578 578 - if (ctrl->state == NVME_CTRL_LIVE) { 578 578 + if (new_state == NVME_CTRL_LIVE) { 579 579 if (old_state == NVME_CTRL_CONNECTING) 580 580 nvme_stop_failfast_work(ctrl); 581 581 nvme_kick_requeue_lists(ctrl); 582 582 - } else if (ctrl->state == NVME_CTRL_CONNECTING && 582 582 + } else if (new_state == NVME_CTRL_CONNECTING && 583 583 old_state == 
NVME_CTRL_RESETTING) { 584 584 nvme_start_failfast_work(ctrl); 585 585 } ··· 592 592 */ 593 593 static bool nvme_state_terminal(struct nvme_ctrl *ctrl) 594 594 { 595 595 - switch (ctrl->state) { 595 595 + switch (nvme_ctrl_state(ctrl)) { 596 596 case NVME_CTRL_NEW: 597 597 case NVME_CTRL_LIVE: 598 598 case NVME_CTRL_RESETTING: ··· 617 617 wait_event(ctrl->state_wq, 618 618 nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) || 619 619 nvme_state_terminal(ctrl)); 620 620 - return ctrl->state == NVME_CTRL_RESETTING; 620 620 + return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING; 621 621 } 622 622 EXPORT_SYMBOL_GPL(nvme_wait_reset); 623 623 ··· 704 704 blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, 705 705 struct request *rq) 706 706 { 707 707 - if (ctrl->state != NVME_CTRL_DELETING_NOIO && 708 708 - ctrl->state != NVME_CTRL_DELETING && 709 709 - ctrl->state != NVME_CTRL_DEAD && 707 707 + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); 708 708 + 709 709 + if (state != NVME_CTRL_DELETING_NOIO && 710 710 + state != NVME_CTRL_DELETING && 711 711 + state != NVME_CTRL_DEAD && 710 712 !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && 711 713 !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) 712 714 return BLK_STS_RESOURCE; ··· 738 736 * command, which is require to set the queue live in the 739 737 * appropinquate states. 
740 738 */ 741 741 - switch (ctrl->state) { 739 739 + switch (nvme_ctrl_state(ctrl)) { 742 740 case NVME_CTRL_CONNECTING: 743 741 if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) && 744 742 (req->cmd->fabrics.fctype == nvme_fabrics_type_connect || ··· 2552 2550 2553 2551 if (ctrl->ps_max_latency_us != latency) { 2554 2552 ctrl->ps_max_latency_us = latency; 2555 2555 - if (ctrl->state == NVME_CTRL_LIVE) 2553 2553 + if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE) 2556 2554 nvme_configure_apst(ctrl); 2557 2555 } 2558 2556 } ··· 3240 3238 struct nvme_ctrl *ctrl = 3241 3239 container_of(inode->i_cdev, struct nvme_ctrl, cdev); 3242 3240 3243 3243 - switch (ctrl->state) { 3241 3241 + switch (nvme_ctrl_state(ctrl)) { 3244 3242 case NVME_CTRL_LIVE: 3245 3243 break; 3246 3244 default: ··· 3662 3660 goto out_unlink_ns; 3663 3661 3664 3662 down_write(&ctrl->namespaces_rwsem); 3663 3663 + /* 3664 3664 + * Ensure that no namespaces are added to the ctrl list after the queues 3665 3665 + * are frozen, thereby avoiding a deadlock between scan and reset. 3666 3666 + */ 3667 3667 + if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) { 3668 3668 + up_write(&ctrl->namespaces_rwsem); 3669 3669 + goto out_unlink_ns; 3670 3670 + } 3665 3671 nvme_ns_add_to_ctrl_list(ns); 3666 3672 up_write(&ctrl->namespaces_rwsem); 3667 3673 nvme_get_ctrl(ctrl); ··· 3934 3924 int ret; 3935 3925 3936 3926 /* No tagset on a live ctrl means IO queues could not created */ 3937 3937 - if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset) 3927 3927 + if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset) 3938 3928 return; 3939 3929 3940 3930 /* ··· 4004 3994 * removing the namespaces' disks; fail all the queues now to avoid 4005 3995 * potentially having to clean up the failed sync later. 
4006 3996 */ 4007 4007 - if (ctrl->state == NVME_CTRL_DEAD) 3997 3997 + if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD) 4008 3998 nvme_mark_namespaces_dead(ctrl); 4009 3999 4010 4000 /* this is a no-op when called from the controller reset handler */ ··· 4086 4076 * flushing ctrl async_event_work after changing the controller state 4087 4077 * from LIVE and before freeing the admin queue. 4088 4078 */ 4089 4089 - if (ctrl->state == NVME_CTRL_LIVE) 4079 4079 + if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE) 4090 4080 ctrl->ops->submit_async_event(ctrl); 4091 4081 } 4092 4082 ··· 4481 4471 { 4482 4472 int ret; 4483 4473 4484 4484 - ctrl->state = NVME_CTRL_NEW; 4474 4474 + WRITE_ONCE(ctrl->state, NVME_CTRL_NEW); 4485 4475 clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); 4486 4476 spin_lock_init(&ctrl->lock); 4487 4477 mutex_init(&ctrl->scan_lock); ··· 4591 4581 list_for_each_entry(ns, &ctrl->namespaces, list) 4592 4582 blk_mq_unfreeze_queue(ns->queue); 4593 4583 up_read(&ctrl->namespaces_rwsem); 4584 4584 + clear_bit(NVME_CTRL_FROZEN, &ctrl->flags); 4594 4585 } 4595 4586 EXPORT_SYMBOL_GPL(nvme_unfreeze); 4596 4587 ··· 4625 4614 { 4626 4615 struct nvme_ns *ns; 4627 4616 4617 4617 + set_bit(NVME_CTRL_FROZEN, &ctrl->flags); 4628 4618 down_read(&ctrl->namespaces_rwsem); 4629 4619 list_for_each_entry(ns, &ctrl->namespaces, list) 4630 4620 blk_freeze_queue_start(ns->queue);

+3 -3

drivers/nvme/host/fc.c

reviewed

··· 557 557 static void 558 558 nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl) 559 559 { 560 560 - switch (ctrl->ctrl.state) { 560 560 + switch (nvme_ctrl_state(&ctrl->ctrl)) { 561 561 case NVME_CTRL_NEW: 562 562 case NVME_CTRL_CONNECTING: 563 563 /* ··· 793 793 "NVME-FC{%d}: controller connectivity lost. Awaiting " 794 794 "Reconnect", ctrl->cnum); 795 795 796 796 - switch (ctrl->ctrl.state) { 796 796 + switch (nvme_ctrl_state(&ctrl->ctrl)) { 797 797 case NVME_CTRL_NEW: 798 798 case NVME_CTRL_LIVE: 799 799 /* ··· 3319 3319 unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ; 3320 3320 bool recon = true; 3321 3321 3322 3322 - if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) 3322 3322 + if (nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_CONNECTING) 3323 3323 return; 3324 3324 3325 3325 if (portptr->port_state == FC_OBJSTATE_ONLINE) {

+11 -10

drivers/nvme/host/ioctl.c

reviewed

··· 18 18 { 19 19 u32 effects; 20 20 21 21 - if (capable(CAP_SYS_ADMIN)) 22 22 - return true; 23 23 - 24 21 /* 25 22 * Do not allow unprivileged passthrough on partitions, as that allows an 26 23 * escape from the containment of the partition. 27 24 */ 28 25 if (flags & NVME_IOCTL_PARTITION) 29 29 - return false; 26 26 + goto admin; 30 27 31 28 /* 32 29 * Do not allow unprivileged processes to send vendor specific or fabrics ··· 31 34 */ 32 35 if (c->common.opcode >= nvme_cmd_vendor_start || 33 36 c->common.opcode == nvme_fabrics_command) 34 34 - return false; 37 37 + goto admin; 35 38 36 39 /* 37 40 * Do not allow unprivileged passthrough of admin commands except ··· 50 53 return true; 51 54 } 52 55 } 53 53 - return false; 56 56 + goto admin; 54 57 } 55 58 56 59 /* ··· 60 63 */ 61 64 effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode); 62 65 if (!(effects & NVME_CMD_EFFECTS_CSUPP)) 63 63 - return false; 66 66 + goto admin; 64 67 65 68 /* 66 69 * Don't allow passthrough for command that have intrusive (or unknown) ··· 69 72 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | 70 73 NVME_CMD_EFFECTS_UUID_SEL | 71 74 NVME_CMD_EFFECTS_SCOPE_MASK)) 72 72 - return false; 75 75 + goto admin; 73 76 74 77 /* 75 78 * Only allow I/O commands that transfer data to the controller or that 76 79 * change the logical block contents if the file descriptor is open for 77 80 * writing. 78 81 */ 79 79 - if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) 80 80 - return open_for_write; 82 82 + if ((nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) && 83 83 + !open_for_write) 84 84 + goto admin; 85 85 + 81 86 return true; 87 87 + admin: 88 88 + return capable(CAP_SYS_ADMIN); 82 89 } 83 90 84 91 /*

+11

drivers/nvme/host/nvme.h

reviewed

··· 156 156 * No temperature thresholds for channels other than 0 (Composite). 157 157 */ 158 158 NVME_QUIRK_NO_SECONDARY_TEMP_THRESH = (1 << 19), 159 159 + 160 160 + /* 161 161 + * Disables simple suspend/resume path. 162 162 + */ 163 163 + NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND = (1 << 20), 159 164 }; 160 165 161 166 /* ··· 256 251 NVME_CTRL_STOPPED = 3, 257 252 NVME_CTRL_SKIP_ID_CNS_CS = 4, 258 253 NVME_CTRL_DIRTY_CAPABILITY = 5, 254 254 + NVME_CTRL_FROZEN = 6, 259 255 }; 260 256 261 257 struct nvme_ctrl { ··· 392 386 enum nvme_ctrl_type cntrltype; 393 387 enum nvme_dctype dctype; 394 388 }; 389 389 + 390 390 + static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl) 391 391 + { 392 392 + return READ_ONCE(ctrl->state); 393 393 + } 395 394 396 395 enum nvme_iopolicy { 397 396 NVME_IOPOLICY_NUMA,

+22 -8

drivers/nvme/host/pci.c

reviewed

··· 1233 1233 bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); 1234 1234 1235 1235 /* If there is a reset/reinit ongoing, we shouldn't reset again. */ 1236 1236 - switch (dev->ctrl.state) { 1236 1236 + switch (nvme_ctrl_state(&dev->ctrl)) { 1237 1237 case NVME_CTRL_RESETTING: 1238 1238 case NVME_CTRL_CONNECTING: 1239 1239 return false; ··· 1321 1321 * cancellation error. All outstanding requests are completed on 1322 1322 * shutdown, so we return BLK_EH_DONE. 1323 1323 */ 1324 1324 - switch (dev->ctrl.state) { 1324 1324 + switch (nvme_ctrl_state(&dev->ctrl)) { 1325 1325 case NVME_CTRL_CONNECTING: 1326 1326 nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); 1327 1327 fallthrough; ··· 1593 1593 /* 1594 1594 * Controller is in wrong state, fail early. 1595 1595 */ 1596 1596 - if (dev->ctrl.state != NVME_CTRL_CONNECTING) { 1596 1596 + if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_CONNECTING) { 1597 1597 mutex_unlock(&dev->shutdown_lock); 1598 1598 return -ENODEV; 1599 1599 } ··· 2573 2573 2574 2574 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) 2575 2575 { 2576 2576 + enum nvme_ctrl_state state = nvme_ctrl_state(&dev->ctrl); 2576 2577 struct pci_dev *pdev = to_pci_dev(dev->dev); 2577 2578 bool dead; 2578 2579 2579 2580 mutex_lock(&dev->shutdown_lock); 2580 2581 dead = nvme_pci_ctrl_is_dead(dev); 2581 2581 - if (dev->ctrl.state == NVME_CTRL_LIVE || 2582 2582 - dev->ctrl.state == NVME_CTRL_RESETTING) { 2582 2582 + if (state == NVME_CTRL_LIVE || state == NVME_CTRL_RESETTING) { 2583 2583 if (pci_is_enabled(pdev)) 2584 2584 nvme_start_freeze(&dev->ctrl); 2585 2585 /* ··· 2690 2690 bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); 2691 2691 int result; 2692 2692 2693 2693 - if (dev->ctrl.state != NVME_CTRL_RESETTING) { 2693 2693 + if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) { 2694 2694 dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", 2695 2695 dev->ctrl.state); 2696 2696 result = -ENODEV; ··· 2902 2902 
if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) && 2903 2903 dmi_match(DMI_BOARD_NAME, "LNVNB161216")) 2904 2904 return NVME_QUIRK_SIMPLE_SUSPEND; 2905 2905 + } else if (pdev->vendor == 0x2646 && (pdev->device == 0x2263 || 2906 2906 + pdev->device == 0x500f)) { 2907 2907 + /* 2908 2908 + * Exclude some Kingston NV1 and A2000 devices from 2909 2909 + * NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a 2910 2910 + * lot fo energy with s2idle sleep on some TUXEDO platforms. 2911 2911 + */ 2912 2912 + if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") || 2913 2913 + dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") || 2914 2914 + dmi_match(DMI_BOARD_NAME, "NS5x_7xPU") || 2915 2915 + dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1")) 2916 2916 + return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND; 2905 2917 } 2906 2918 2907 2919 return 0; ··· 2944 2932 dev->dev = get_device(&pdev->dev); 2945 2933 2946 2934 quirks |= check_vendor_combination_bug(pdev); 2947 2947 - if (!noacpi && acpi_storage_d3(&pdev->dev)) { 2935 2935 + if (!noacpi && 2936 2936 + !(quirks & NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND) && 2937 2937 + acpi_storage_d3(&pdev->dev)) { 2948 2938 /* 2949 2939 * Some systems use a bios work around to ask for D3 on 2950 2940 * platforms that support kernel managed suspend. ··· 3206 3192 nvme_wait_freeze(ctrl); 3207 3193 nvme_sync_queues(ctrl); 3208 3194 3209 3209 - if (ctrl->state != NVME_CTRL_LIVE) 3195 3195 + if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) 3210 3196 goto unfreeze; 3211 3197 3212 3198 /*

+14 -9

drivers/nvme/host/rdma.c

reviewed

··· 984 984 985 985 static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) 986 986 { 987 987 + enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl); 988 988 + 987 989 /* If we are resetting/deleting then do nothing */ 988 988 - if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) { 989 989 - WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || 990 990 - ctrl->ctrl.state == NVME_CTRL_LIVE); 990 990 + if (state != NVME_CTRL_CONNECTING) { 991 991 + WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE); 991 992 return; 992 993 } 993 994 ··· 1060 1059 * unless we're during creation of a new controller to 1061 1060 * avoid races with teardown flow. 1062 1061 */ 1063 1063 - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && 1064 1064 - ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); 1062 1062 + enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl); 1063 1063 + 1064 1064 + WARN_ON_ONCE(state != NVME_CTRL_DELETING && 1065 1065 + state != NVME_CTRL_DELETING_NOIO); 1065 1066 WARN_ON_ONCE(new); 1066 1067 ret = -EINVAL; 1067 1068 goto destroy_io; ··· 1132 1129 1133 1130 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { 1134 1131 /* state change failure is ok if we started ctrl delete */ 1135 1135 - WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING && 1136 1136 - ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO); 1132 1132 + enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl); 1133 1133 + 1134 1134 + WARN_ON_ONCE(state != NVME_CTRL_DELETING && 1135 1135 + state != NVME_CTRL_DELETING_NOIO); 1137 1136 return; 1138 1137 } 1139 1138 ··· 1167 1162 struct nvme_rdma_queue *queue = wc->qp->qp_context; 1168 1163 struct nvme_rdma_ctrl *ctrl = queue->ctrl; 1169 1164 1170 1170 - if (ctrl->ctrl.state == NVME_CTRL_LIVE) 1165 1165 + if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_LIVE) 1171 1166 dev_info(ctrl->ctrl.device, 1172 1167 "%s for CQE 0x%p failed with status %s (%d)\n", 1173 1168 op, wc->wr_cqe, ··· 1950 1945 dev_warn(ctrl->ctrl.device, 
"I/O %d QID %d timeout\n", 1951 1946 rq->tag, nvme_rdma_queue_idx(queue)); 1952 1947 1953 1953 - if (ctrl->ctrl.state != NVME_CTRL_LIVE) { 1948 1948 + if (nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_LIVE) { 1954 1949 /* 1955 1950 * If we are resetting, connecting or deleting we should 1956 1951 * complete immediately because we may block controller

+17 -10

drivers/nvme/host/tcp.c

reviewed

··· 2152 2152 2153 2153 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) 2154 2154 { 2155 2155 + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); 2156 2156 + 2155 2157 /* If we are resetting/deleting then do nothing */ 2156 2156 - if (ctrl->state != NVME_CTRL_CONNECTING) { 2157 2157 - WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW || 2158 2158 - ctrl->state == NVME_CTRL_LIVE); 2158 2158 + if (state != NVME_CTRL_CONNECTING) { 2159 2159 + WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE); 2159 2160 return; 2160 2161 } 2161 2162 ··· 2216 2215 * unless we're during creation of a new controller to 2217 2216 * avoid races with teardown flow. 2218 2217 */ 2219 2219 - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && 2220 2220 - ctrl->state != NVME_CTRL_DELETING_NOIO); 2218 2218 + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); 2219 2219 + 2220 2220 + WARN_ON_ONCE(state != NVME_CTRL_DELETING && 2221 2221 + state != NVME_CTRL_DELETING_NOIO); 2221 2222 WARN_ON_ONCE(new); 2222 2223 ret = -EINVAL; 2223 2224 goto destroy_io; ··· 2283 2280 2284 2281 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { 2285 2282 /* state change failure is ok if we started ctrl delete */ 2286 2286 - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && 2287 2287 - ctrl->state != NVME_CTRL_DELETING_NOIO); 2283 2283 + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); 2284 2284 + 2285 2285 + WARN_ON_ONCE(state != NVME_CTRL_DELETING && 2286 2286 + state != NVME_CTRL_DELETING_NOIO); 2288 2287 return; 2289 2288 } 2290 2289 ··· 2316 2311 2317 2312 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { 2318 2313 /* state change failure is ok if we started ctrl delete */ 2319 2319 - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && 2320 2320 - ctrl->state != NVME_CTRL_DELETING_NOIO); 2314 2314 + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); 2315 2315 + 2316 2316 + WARN_ON_ONCE(state != NVME_CTRL_DELETING && 2317 2317 + state != NVME_CTRL_DELETING_NOIO); 2321 
2318 return; 2322 2319 } 2323 2320 ··· 2437 2430 nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type, 2438 2431 opc, nvme_opcode_str(qid, opc, fctype)); 2439 2432 2440 2440 - if (ctrl->state != NVME_CTRL_LIVE) { 2433 2433 + if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) { 2441 2434 /* 2442 2435 * If we are resetting, connecting or deleting we should 2443 2436 * complete immediately because we may block controller

+3 -2

drivers/nvme/target/Kconfig

reviewed

··· 99 99 If unsure, say N. 100 100 101 101 config NVME_TARGET_AUTH 102 102 - bool "NVMe over Fabrics In-band Authentication support" 102 102 + bool "NVMe over Fabrics In-band Authentication in target side" 103 103 depends on NVME_TARGET 104 104 select NVME_AUTH 105 105 help 106 106 - This enables support for NVMe over Fabrics In-band Authentication 106 106 + This enables support for NVMe over Fabrics In-band Authentication in 107 107 + target side. 107 108 108 109 If unsure, say N.

drivers/nvme/target/configfs.c

reviewed

··· 18 18 #include <linux/nvme-keyring.h> 19 19 #include <crypto/hash.h> 20 20 #include <crypto/kpp.h> 21 21 + #include <linux/nospec.h> 21 22 22 23 #include "nvmet.h" 23 24 ··· 622 621 623 622 down_write(&nvmet_ana_sem); 624 623 oldgrpid = ns->anagrpid; 624 624 + newgrpid = array_index_nospec(newgrpid, NVMET_MAX_ANAGRPS); 625 625 nvmet_ana_group_enabled[newgrpid]++; 626 626 ns->anagrpid = newgrpid; 627 627 nvmet_ana_group_enabled[oldgrpid]--; ··· 1814 1812 grp->grpid = grpid; 1815 1813 1816 1814 down_write(&nvmet_ana_sem); 1815 1815 + grpid = array_index_nospec(grpid, NVMET_MAX_ANAGRPS); 1817 1816 nvmet_ana_group_enabled[grpid]++; 1818 1817 up_write(&nvmet_ana_sem); 1819 1818