Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <linux/vmalloc.h>
#include <trace/events/block.h>
#include "nvme.h"

bool multipath = true;
static bool multipath_always_on;

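/*
 * Setter for the "multipath" module parameter: once multipath_always_on has
 * been enabled, native multipath can no longer be switched off, so reject any
 * attempt to clear the flag. (Usage note, as an illustration only: for the
 * nvme-core module this is typically set via the kernel command line, e.g.
 * "nvme_core.multipath=N", or via modprobe options.)
 */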
static int multipath_param_set(const char *val, const struct kernel_param *kp)
{
	int ret;
	bool *arg = kp->arg;

	ret = param_set_bool(val, kp);
	if (ret)
		return ret;

	if (multipath_always_on && !*arg) {
		pr_err("Can't disable multipath when multipath_always_on is configured.\n");
		*arg = true;
		return -EINVAL;
	}

	return 0;
}

static const struct kernel_param_ops multipath_param_ops = {
	.set = multipath_param_set,
	.get = param_get_bool,
};

module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

static int multipath_always_on_set(const char *val,
		const struct kernel_param *kp)
{
	int ret;
	bool *arg = kp->arg;

	ret = param_set_bool(val, kp);
	if (ret < 0)
		return ret;

	if (*arg)
		multipath = true;

	return 0;
}

static const struct kernel_param_ops multipath_always_on_ops = {
	.set = multipath_always_on_set,
	.get = param_get_bool,
};

module_param_cb(multipath_always_on, &multipath_always_on_ops,
		&multipath_always_on, 0444);
MODULE_PARM_DESC(multipath_always_on,
	"create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
	[NVME_IOPOLICY_QD]	= "queue-depth",
};

static int iopolicy = NVME_IOPOLICY_NUMA;

static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
{
	if (!val)
		return -EINVAL;
	if (!strncmp(val, "numa", 4))
		iopolicy = NVME_IOPOLICY_NUMA;
	else if (!strncmp(val, "round-robin", 11))
		iopolicy = NVME_IOPOLICY_RR;
	else if (!strncmp(val, "queue-depth", 11))
		iopolicy = NVME_IOPOLICY_QD;
	else
		return -EINVAL;

	return 0;
}

static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
{
	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
}

module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
	&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");

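/*
 * The module-level "iopolicy" value only provides the default: it is copied
 * into each newly created subsystem here and can be changed later per
 * subsystem through the sysfs "iopolicy" attribute defined further down in
 * this file.
 */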
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
	subsys->iopolicy = iopolicy;
}

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue_nomemrestore(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}

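/*
 * Fail a request over to another path: clear the cached current path, kick
 * off an ANA log re-read for ANA errors, move the bios back onto the head's
 * requeue list pointing at the multipath device, complete the original
 * request with a cleared status, and schedule the requeue work so the I/O is
 * retried on another path.
 */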
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next)
		bio_set_dev(bio, ns->head->disk->part0);
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	nvme_req(req)->status = 0;
	nvme_end_req(req);
	kblockd_schedule_work(&ns->head->requeue_work);
}

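/*
 * Per-path accounting at request start: for the queue-depth policy, bump the
 * controller's nr_active counter (and mark the request so the counter is
 * dropped exactly once on completion); for regular file system I/O with
 * stats enabled, start block I/O accounting against the multipath gendisk.
 */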
void nvme_mpath_start_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;
	struct gendisk *disk = ns->head->disk;

	if ((READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) &&
	    !(nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)) {
		atomic_inc(&ns->ctrl->nr_active);
		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
	}

	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq) ||
	    (nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
		return;

	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
						      jiffies);
}
EXPORT_SYMBOL_GPL(nvme_mpath_start_request);

void nvme_mpath_end_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;

	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
		atomic_dec_if_positive(&ns->ctrl->nr_active);

	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
		return;
	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
			 nvme_req(rq)->start_time);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
			disk_uevent(ns->head->disk, KOBJ_CHANGE);
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
	return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
}

void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	sector_t capacity = get_capacity(head->disk);
	int node;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (capacity != get_capacity(ns->disk))
			clear_bit(NVME_NS_READY, &ns->flags);
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

	for_each_node(node)
		rcu_assign_pointer(head->current_path[node], NULL);
	kblockd_schedule_work(&head->requeue_work);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	enum nvme_ctrl_state state = nvme_ctrl_state(ns->ctrl);

	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
	 */
	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    !test_bit(NVME_NS_READY, &ns->flags))
		return true;
	return false;
}

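/*
 * Find a usable path for @node: among the sibling paths that are not
 * disabled, prefer an ANA-optimized path, falling back to a non-optimized
 * one.  Under the NUMA iopolicy ties are broken by the smallest NUMA
 * distance to @node; the winner is cached in head->current_path[node].
 */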
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
				   siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

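/*
 * Round-robin policy: advance from the cached current path and walk the
 * other siblings, preferring the first usable ANA-optimized path and
 * remembering a usable non-optimized one as fallback; the current path
 * itself is only reused under the conditions spelled out in the comment
 * inside the function.
 */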
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns, *found = NULL;
	int node = numa_node_id();
	struct nvme_ns *old = srcu_dereference(head->current_path[node],
					       &head->srcu);

	if (unlikely(!old))
		return __nvme_find_path(head, node);

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 * - no other optimized path found and current is optimized,
	 * - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

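/*
 * Queue-depth policy: pick the usable path whose controller currently has
 * the fewest active requests (nr_active), preferring ANA-optimized paths
 * over non-optimized ones and stopping early once an idle optimized path is
 * found.
 */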
static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
{
	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
	unsigned int depth;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (nvme_path_is_disabled(ns))
			continue;

		depth = atomic_read(&ns->ctrl->nr_active);

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (depth < min_depth_opt) {
				min_depth_opt = depth;
				best_opt = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (depth < min_depth_nonopt) {
				min_depth_nonopt = depth;
				best_nonopt = ns;
			}
			break;
		default:
			break;
		}

		if (min_depth_opt == 0)
			return best_opt;
	}

	return best_opt ? best_opt : best_nonopt;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}

inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	switch (READ_ONCE(head->subsys->iopolicy)) {
	case NVME_IOPOLICY_QD:
		return nvme_queue_depth_path(head);
	case NVME_IOPOLICY_RR:
		return nvme_round_robin_path(head);
	default:
		return nvme_numa_path(head);
	}
}

static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
		return false;

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (nvme_ctrl_state(ns->ctrl)) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			return true;
		default:
			break;
		}
	}

	/*
	 * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
	 * not immediately fail I/O. Instead, requeue the I/O for the configured
	 * duration, anticipating that if there's a transient link failure then
	 * it may recover within this time window. This parameter is exported to
	 * userspace via sysfs, and its default value is zero. It is internally
	 * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
	 * non-zero, this flag is set to true. When zero, the flag is cleared.
	 */
	return nvme_mpath_queue_if_no_path(head);
}

static void nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio_io_error(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}

static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
{
	if (!nvme_tryget_ns_head(disk->private_data))
		return -ENXIO;
	return 0;
}

static void nvme_ns_head_release(struct gendisk *disk)
{
	nvme_put_ns_head(disk->private_data);
}

static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
		enum blk_unique_id type)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_get_unique_id(ns, id, type);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, struct blk_report_zones_args *args)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, args);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */

const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
	.getgeo		= nvme_getgeo,
	.get_unique_id	= nvme_ns_head_get_unique_id,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};

static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}

static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
{
	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
		return -ENXIO;
	return 0;
}

static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}

static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
};

static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
{
	int ret;

	head->cdev_device.parent = &head->subsys->dev;
	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
			   head->subsys->instance, head->instance);
	if (ret)
		return ret;
	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
			    &nvme_ns_head_chr_fops, THIS_MODULE);
	return ret;
}

static void nvme_partition_scan_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, partition_scan_work);

	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
					     &head->disk->state)))
		return;

	mutex_lock(&head->disk->open_mutex);
	bdev_disk_changed(head->disk, false);
	mutex_unlock(&head->disk->open_mutex);
}

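/*
 * Requeue worker: drain the head's requeue_list and resubmit each bio.  The
 * bios were redirected to the multipath gendisk when they were requeued, so
 * resubmission goes through nvme_ns_head_submit_bio() and path selection
 * happens again.
 */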
static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		submit_bio_noacct(bio);
	}
}

static void nvme_remove_head(struct nvme_ns_head *head)
{
	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		/*
		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
		 * to allow multipath to fail all I/O.
		 */
		kblockd_schedule_work(&head->requeue_work);

		nvme_cdev_del(&head->cdev, &head->cdev_device);
		synchronize_srcu(&head->srcu);
		del_gendisk(head->disk);
	}
	nvme_put_ns_head(head);
}

static void nvme_remove_head_work(struct work_struct *work)
{
	struct nvme_ns_head *head = container_of(to_delayed_work(work),
			struct nvme_ns_head, remove_work);
	bool remove = false;

	mutex_lock(&head->subsys->lock);
	if (list_empty(&head->list)) {
		list_del_init(&head->entry);
		remove = true;
	}
	mutex_unlock(&head->subsys->lock);
	if (remove)
		nvme_remove_head(head);

	module_put(THIS_MODULE);
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct queue_limits lim;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);
	INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
	INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
	head->delayed_removal_secs = 0;

	/*
	 * If "multipath_always_on" is enabled, a multipath node is added
	 * regardless of whether the disk is single/multi ported, and whether
	 * the namespace is shared or private. If "multipath_always_on" is not
	 * enabled, a multipath node is added only if the subsystem supports
	 * multiple controllers and the "multipath" option is configured. In
	 * either case, for private namespaces, we ensure that the NSID is
	 * unique.
	 */
	if (!multipath_always_on) {
		if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
		    !multipath)
			return 0;
	}

	if (!nvme_is_unique_nsid(ctrl, head))
		return 0;

	blk_set_stacking_limits(&lim);
	lim.dma_alignment = 3;
	lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT |
			BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES;
	if (head->ids.csi == NVME_CSI_ZNS)
		lim.features |= BLK_FEAT_ZONED;

	head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
	if (IS_ERR(head->disk))
		return PTR_ERR(head->disk);
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;

	/*
	 * We need to suppress the partition scan from occurring within the
	 * controller's scan_work context. If a path error occurs here, the I/O
	 * will wait until a path becomes available or all paths are torn down,
	 * but that action also occurs within scan_work, so it would deadlock.
	 * Defer the partition scan to a different context that does not block
	 * scan_work.
	 */
	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	nvme_tryget_ns_head(head);
	return 0;
}

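/*
 * Called when a path becomes live: register the head gendisk and char device
 * on the first live path (the partition scan is deferred to a workqueue, see
 * nvme_mpath_alloc_disk()), add the sysfs links for the paths, pre-populate
 * the per-node current_path cache if this path is optimized, and finally
 * requeue any pending I/O.
 */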
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int rc;

	if (!head->disk)
		return;

	/*
	 * test_and_set_bit() is used because it is protecting against two nvme
	 * paths simultaneously calling device_add_disk() on the same namespace
	 * head.
	 */
	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		rc = device_add_disk(&head->subsys->dev, head->disk,
				     nvme_ns_attr_groups);
		if (rc) {
			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
			return;
		}
		nvme_add_ns_head_cdev(head);
		queue_work(nvme_wq, &head->partition_scan_work);
	}

	nvme_mpath_add_sysfs_link(ns->head);

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_online_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}

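/*
 * Walk the ANA log page in ctrl->ana_log_buf and invoke @cb for every group
 * descriptor, passing @data through.  Each descriptor is sanity checked
 * (offsets against ana_log_size, group id and state ranges) before the
 * callback runs; parsing stops on the first error.
 */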
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			  void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device. However we cannot accept this I/O
	 * if the controller is not live. This may deadlock if called from
	 * nvme_mpath_init_identify() and the ctrl will never complete
	 * initialization, preventing I/O from completing. For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
	else {
		/*
		 * Add a sysfs link from the multipath head gendisk node to the
		 * path device gendisk node.
		 * If the path's ana state is live (i.e. the state is either
		 * optimized or non-optimized) while we allocate the ns, then
		 * the sysfs link is created from nvme_mpath_set_live() and we
		 * do not fall through to this code path. For a path whose ana
		 * state is not live, nvme_mpath_set_live() is only called once
		 * the ana state transitions to a live state, but we still want
		 * the sysfs link from the head node to the path device
		 * irrespective of the path's ana state.
		 * So if we reach this point, the path's ana state is not live;
		 * create the sysfs link to this path from the head node,
		 * provided the head node of the path has already come alive.
		 */
		if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
			nvme_mpath_add_sysfs_link(ns->head);
	}
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;
	int srcu_idx;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	srcu_idx = srcu_read_lock(&ctrl->srcu);
	list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
				 srcu_read_lock_held(&ctrl->srcu)) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
		if (ns->head->ns_id > nsid)
			goto again;
	}
	srcu_read_unlock(&ctrl->srcu, srcu_idx);
	return 0;
}

static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might
	 * enter the change state at different times. But that is a lot of
	 * overhead just to protect against a target that keeps entering new
	 * change states while never finishing previous ones. We'll still
	 * eventually time out once all groups are in change state, so this
	 * isn't a big deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		timer_delete_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}

void nvme_mpath_update(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;

	if (!ctrl->ana_log_buf)
		return;

	mutex_lock(&ctrl->ana_lock);
	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
	mutex_unlock(&ctrl->ana_lock);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = timer_container_of(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	timer_delete_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

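/*
 * Per-subsystem sysfs handlers for the "iopolicy" attribute.  As an
 * illustrative (not authoritative) usage example from userspace:
 *
 *	echo round-robin > /sys/class/nvme-subsystem/nvme-subsys0/iopolicy
 *
 * Changing the policy clears the cached paths of every controller in the
 * subsystem, as nvme_subsys_iopolicy_update() below does explicitly.
 */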
static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
		int iopolicy)
{
	struct nvme_ctrl *ctrl;
	int old_iopolicy = READ_ONCE(subsys->iopolicy);

	if (old_iopolicy == iopolicy)
		return;

	WRITE_ONCE(subsys->iopolicy, iopolicy);

	/* iopolicy changes clear the mpath by design */
	mutex_lock(&nvme_subsystems_lock);
	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
		nvme_mpath_clear_ctrl_paths(ctrl);
	mutex_unlock(&nvme_subsystems_lock);

	pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
			subsys->subnqn,
			nvme_iopolicy_names[old_iopolicy],
			nvme_iopolicy_names[iopolicy]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			nvme_subsys_iopolicy_update(subsys, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
	       nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static ssize_t queue_depth_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	if (ns->head->subsys->iopolicy != NVME_IOPOLICY_QD)
		return 0;

	return sysfs_emit(buf, "%d\n", atomic_read(&ns->ctrl->nr_active));
}
DEVICE_ATTR_RO(queue_depth);

static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	int node, srcu_idx;
	nodemask_t numa_nodes;
	struct nvme_ns *current_ns;
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
	struct nvme_ns_head *head = ns->head;

	if (head->subsys->iopolicy != NVME_IOPOLICY_NUMA)
		return 0;

	nodes_clear(numa_nodes);

	srcu_idx = srcu_read_lock(&head->srcu);
	for_each_node(node) {
		current_ns = srcu_dereference(head->current_path[node],
				&head->srcu);
		if (ns == current_ns)
			node_set(node, numa_nodes);
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

	return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&numa_nodes));
}
DEVICE_ATTR_RO(numa_nodes);

static ssize_t delayed_removal_secs_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nvme_ns_head *head = disk->private_data;
	int ret;

	mutex_lock(&head->subsys->lock);
	ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
	mutex_unlock(&head->subsys->lock);
	return ret;
}

static ssize_t delayed_removal_secs_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nvme_ns_head *head = disk->private_data;
	unsigned int sec;
	int ret;

	ret = kstrtouint(buf, 0, &sec);
	if (ret < 0)
		return ret;

	mutex_lock(&head->subsys->lock);
	head->delayed_removal_secs = sec;
	if (sec)
		set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
	else
		clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
	mutex_unlock(&head->subsys->lock);
	/*
	 * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
	 * by its reader.
	 */
	synchronize_srcu(&head->srcu);

	return count;
}

DEVICE_ATTR_RW(delayed_removal_secs);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}

void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
{
	struct device *target;
	int rc, srcu_idx;
	struct nvme_ns *ns;
	struct kobject *kobj;

	/*
	 * Ensure that the head disk node has already been added, otherwise we
	 * may get an invalid kobj for the head disk node.
	 */
	if (!test_bit(GD_ADDED, &head->disk->state))
		return;

	kobj = &disk_to_dev(head->disk)->kobj;

	/*
	 * Loop through each ns chained through head->list and create the
	 * sysfs link from the head node to the ns path node.
	 */
	srcu_idx = srcu_read_lock(&head->srcu);

	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		/*
		 * Ensure that the ns path disk node has already been added,
		 * otherwise we may get an invalid kobj name for the target.
		 */
		if (!test_bit(GD_ADDED, &ns->disk->state))
			continue;

		/*
		 * Avoid creating the link if it already exists for the given
		 * path. When a path's ana state transitions from optimized to
		 * non-optimized or vice-versa, nvme_mpath_set_live() is
		 * invoked, which in turn calls this function. Now if the sysfs
		 * link already exists for the given path and we attempt to re-
		 * create the link then sysfs code would warn about it loudly.
		 * So we evaluate the NVME_NS_SYSFS_ATTR_LINK flag here to
		 * ensure that we're not creating a duplicate link.
		 * The test_and_set_bit() is used because it is protecting
		 * against multiple nvme paths being simultaneously added.
		 */
		if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
			continue;

		target = disk_to_dev(ns->disk);
		/*
		 * Create sysfs link from head gendisk kobject @kobj to the
		 * ns path gendisk kobject @target->kobj.
		 */
		rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
					     &target->kobj, dev_name(target));
		if (unlikely(rc)) {
			dev_err(disk_to_dev(ns->head->disk),
				"failed to create link to %s\n",
				dev_name(target));
			clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
		}
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}

void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
{
	struct device *target;
	struct kobject *kobj;

	if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
		return;

	target = disk_to_dev(ns->disk);
	kobj = &disk_to_dev(ns->head->disk)->kobj;
	sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name,
				     dev_name(target));
	clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
}

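/*
 * Hook a newly scanned namespace into the multipath machinery: for ANA
 * controllers, look up the namespace's group descriptor in the cached log
 * (or schedule ana_work if it is not there yet) and apply its state;
 * otherwise treat the path as optimized and set it live immediately.
 */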
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->nr_zones = ns->disk->nr_zones;
#endif
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	bool remove = false;

	if (!head->disk)
		return;

	mutex_lock(&head->subsys->lock);
	/*
	 * We are called when all paths have been removed, and at that point
	 * head->list is expected to be empty. However, nvme_ns_remove() and
	 * nvme_init_ns_head() can run concurrently and so if head->delayed_
	 * removal_secs is configured, it is possible that by the time we reach
	 * this point, head->list may no longer be empty. Therefore, we recheck
	 * head->list here. If it is no longer empty then we skip enqueuing the
	 * delayed head removal work.
	 */
	if (!list_empty(&head->list))
		goto out;

	/*
	 * Ensure that no one could remove this module while the head
	 * remove work is pending.
	 */
	if (head->delayed_removal_secs && try_module_get(THIS_MODULE)) {
		mod_delayed_work(nvme_wq, &head->remove_work,
				head->delayed_removal_secs * HZ);
	} else {
		list_del_init(&head->entry);
		remove = true;
	}
out:
	mutex_unlock(&head->subsys->lock);
	if (remove)
		nvme_remove_head(head);
}

void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	flush_work(&head->partition_scan_work);
	put_disk(head->disk);
}

void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}

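/*
 * Controller-level ANA setup from the Identify Controller data: validate
 * MNAN against NN, record ANACAP/ANATT/NANAGRPID/ANAGRPMAX, size the ANA log
 * buffer (bailing out and disabling ANA if it would exceed the transfer
 * size), allocate or reuse the buffer, and do an initial log read.
 */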
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	/* initialize this in the identify path to cover controller resets */
	atomic_set(&ctrl->nr_active, 0);

	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		nvme_mpath_uninit(ctrl);
		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kvfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
	ctrl->ana_log_size = 0;
}