Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
"A collection of fixes that should go into this series. This contains:

- A set of NVMe fixes, pulled from Christoph. This includes a set of
fixes for the Fibre Channel bits from James Smart, an RDMA queue depth
fix from Marta, controller removal fixes from Ming, and some more
APST quirk updates from Andy.

- A blk-mq debugfs fix from Bart, fixing a problem with the
untangling of the sysfs and debugfs blk-mq bits that was added in
this series.

- Error code fix in add_partition() from Dan.

- A small series of fixes for the new blk-throttle code from Shaohua"

* 'for-linus' of git://git.kernel.dk/linux-block: (21 commits)
blk-mq: Only register debugfs attributes for blk-mq queues
nvme: Quirk APST on Intel 600P/P3100 devices
nvme: only setup block integrity if supported by the driver
nvme: replace is_flags field in nvme_ctrl_ops with a flags field
nvme-pci: consistently use ctrl->device for logging
partitions/msdos: FreeBSD UFS2 file systems are not recognized
block: fix an error code in add_partition()
blk-throttle: force user to configure all settings for io.low
blk-throttle: respect 0 bps/iops settings for io.low
blk-throttle: output some debug info in trace
blk-throttle: add hierarchy support for latency target and idle time
nvme_fc: remove extra controller reference taken on reconnect
nvme_fc: correct nvme status set on abort
nvme_fc: set logging level on resets/deletes
nvme_fc: revise comment on teardown
nvme_fc: Support ctrl_loss_tmo
nvme_fc: get rid of local reconnect_delay
blk-mq: remove blk_mq_abort_requeue_list()
nvme: avoid to use blk_mq_abort_requeue_list()
nvme: use blk_mq_start_hw_queues() in nvme_kill_queues()
...

+248 -211
-19
block/blk-mq.c
··· 628 628 } 629 629 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); 630 630 631 - void blk_mq_abort_requeue_list(struct request_queue *q) 632 - { 633 - unsigned long flags; 634 - LIST_HEAD(rq_list); 635 - 636 - spin_lock_irqsave(&q->requeue_lock, flags); 637 - list_splice_init(&q->requeue_list, &rq_list); 638 - spin_unlock_irqrestore(&q->requeue_lock, flags); 639 - 640 - while (!list_empty(&rq_list)) { 641 - struct request *rq; 642 - 643 - rq = list_first_entry(&rq_list, struct request, queuelist); 644 - list_del_init(&rq->queuelist); 645 - blk_mq_end_request(rq, -EIO); 646 - } 647 - } 648 - EXPORT_SYMBOL(blk_mq_abort_requeue_list); 649 - 650 631 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 651 632 { 652 633 if (tag < tags->nr_tags) {
+3 -3
block/blk-sysfs.c
··· 887 887 goto unlock; 888 888 } 889 889 890 - if (q->mq_ops) 890 + if (q->mq_ops) { 891 891 __blk_mq_register_dev(dev, q); 892 - 893 - blk_mq_debugfs_register(q); 892 + blk_mq_debugfs_register(q); 893 + } 894 894 895 895 kobject_uevent(&q->kobj, KOBJ_ADD); 896 896
+108 -62
block/blk-throttle.c
··· 22 22 #define DFL_THROTL_SLICE_HD (HZ / 10) 23 23 #define DFL_THROTL_SLICE_SSD (HZ / 50) 24 24 #define MAX_THROTL_SLICE (HZ) 25 - #define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */ 26 - #define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */ 27 25 #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ 28 - /* default latency target is 0, eg, guarantee IO latency by default */ 29 - #define DFL_LATENCY_TARGET (0) 26 + #define MIN_THROTL_BPS (320 * 1024) 27 + #define MIN_THROTL_IOPS (10) 28 + #define DFL_LATENCY_TARGET (-1L) 29 + #define DFL_IDLE_THRESHOLD (0) 30 30 31 31 #define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) 32 32 ··· 157 157 unsigned long last_check_time; 158 158 159 159 unsigned long latency_target; /* us */ 160 + unsigned long latency_target_conf; /* us */ 160 161 /* When did we start a new slice */ 161 162 unsigned long slice_start[2]; 162 163 unsigned long slice_end[2]; ··· 166 165 unsigned long checked_last_finish_time; /* ns / 1024 */ 167 166 unsigned long avg_idletime; /* ns / 1024 */ 168 167 unsigned long idletime_threshold; /* us */ 168 + unsigned long idletime_threshold_conf; /* us */ 169 169 170 170 unsigned int bio_cnt; /* total bios */ 171 171 unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ ··· 202 200 struct work_struct dispatch_work; 203 201 unsigned int limit_index; 204 202 bool limit_valid[LIMIT_CNT]; 205 - 206 - unsigned long dft_idletime_threshold; /* us */ 207 203 208 204 unsigned long low_upgrade_time; 209 205 unsigned long low_downgrade_time; ··· 294 294 295 295 td = tg->td; 296 296 ret = tg->bps[rw][td->limit_index]; 297 - if (ret == 0 && td->limit_index == LIMIT_LOW) 298 - return tg->bps[rw][LIMIT_MAX]; 297 + if (ret == 0 && td->limit_index == LIMIT_LOW) { 298 + /* intermediate node or iops isn't 0 */ 299 + if (!list_empty(&blkg->blkcg->css.children) || 300 + tg->iops[rw][td->limit_index]) 301 + return U64_MAX; 302 + else 303 + return MIN_THROTL_BPS; 304 + } 299 305 300 306 if (td->limit_index == LIMIT_MAX && 
tg->bps[rw][LIMIT_LOW] && 301 307 tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { ··· 321 315 322 316 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) 323 317 return UINT_MAX; 318 + 324 319 td = tg->td; 325 320 ret = tg->iops[rw][td->limit_index]; 326 - if (ret == 0 && tg->td->limit_index == LIMIT_LOW) 327 - return tg->iops[rw][LIMIT_MAX]; 321 + if (ret == 0 && tg->td->limit_index == LIMIT_LOW) { 322 + /* intermediate node or bps isn't 0 */ 323 + if (!list_empty(&blkg->blkcg->css.children) || 324 + tg->bps[rw][td->limit_index]) 325 + return UINT_MAX; 326 + else 327 + return MIN_THROTL_IOPS; 328 + } 328 329 329 330 if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && 330 331 tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { ··· 495 482 /* LIMIT_LOW will have default value 0 */ 496 483 497 484 tg->latency_target = DFL_LATENCY_TARGET; 485 + tg->latency_target_conf = DFL_LATENCY_TARGET; 486 + tg->idletime_threshold = DFL_IDLE_THRESHOLD; 487 + tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; 498 488 499 489 return &tg->pd; 500 490 } ··· 526 510 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) 527 511 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; 528 512 tg->td = td; 529 - 530 - tg->idletime_threshold = td->dft_idletime_threshold; 531 513 } 532 514 533 515 /* ··· 1363 1349 return 0; 1364 1350 } 1365 1351 1366 - static void tg_conf_updated(struct throtl_grp *tg) 1352 + static void tg_conf_updated(struct throtl_grp *tg, bool global) 1367 1353 { 1368 1354 struct throtl_service_queue *sq = &tg->service_queue; 1369 1355 struct cgroup_subsys_state *pos_css; ··· 1381 1367 * restrictions in the whole hierarchy and allows them to bypass 1382 1368 * blk-throttle. 1383 1369 */ 1384 - blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) 1385 - tg_update_has_rules(blkg_to_tg(blkg)); 1370 + blkg_for_each_descendant_pre(blkg, pos_css, 1371 + global ? 
tg->td->queue->root_blkg : tg_to_blkg(tg)) { 1372 + struct throtl_grp *this_tg = blkg_to_tg(blkg); 1373 + struct throtl_grp *parent_tg; 1374 + 1375 + tg_update_has_rules(this_tg); 1376 + /* ignore root/second level */ 1377 + if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || 1378 + !blkg->parent->parent) 1379 + continue; 1380 + parent_tg = blkg_to_tg(blkg->parent); 1381 + /* 1382 + * make sure all children has lower idle time threshold and 1383 + * higher latency target 1384 + */ 1385 + this_tg->idletime_threshold = min(this_tg->idletime_threshold, 1386 + parent_tg->idletime_threshold); 1387 + this_tg->latency_target = max(this_tg->latency_target, 1388 + parent_tg->latency_target); 1389 + } 1386 1390 1387 1391 /* 1388 1392 * We're already holding queue_lock and know @tg is valid. Let's ··· 1445 1413 else 1446 1414 *(unsigned int *)((void *)tg + of_cft(of)->private) = v; 1447 1415 1448 - tg_conf_updated(tg); 1416 + tg_conf_updated(tg, false); 1449 1417 ret = 0; 1450 1418 out_finish: 1451 1419 blkg_conf_finish(&ctx); ··· 1529 1497 tg->iops_conf[READ][off] == iops_dft && 1530 1498 tg->iops_conf[WRITE][off] == iops_dft && 1531 1499 (off != LIMIT_LOW || 1532 - (tg->idletime_threshold == tg->td->dft_idletime_threshold && 1533 - tg->latency_target == DFL_LATENCY_TARGET))) 1500 + (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD && 1501 + tg->latency_target_conf == DFL_LATENCY_TARGET))) 1534 1502 return 0; 1535 1503 1536 - if (tg->bps_conf[READ][off] != bps_dft) 1504 + if (tg->bps_conf[READ][off] != U64_MAX) 1537 1505 snprintf(bufs[0], sizeof(bufs[0]), "%llu", 1538 1506 tg->bps_conf[READ][off]); 1539 - if (tg->bps_conf[WRITE][off] != bps_dft) 1507 + if (tg->bps_conf[WRITE][off] != U64_MAX) 1540 1508 snprintf(bufs[1], sizeof(bufs[1]), "%llu", 1541 1509 tg->bps_conf[WRITE][off]); 1542 - if (tg->iops_conf[READ][off] != iops_dft) 1510 + if (tg->iops_conf[READ][off] != UINT_MAX) 1543 1511 snprintf(bufs[2], sizeof(bufs[2]), "%u", 1544 1512 tg->iops_conf[READ][off]); 
1545 - if (tg->iops_conf[WRITE][off] != iops_dft) 1513 + if (tg->iops_conf[WRITE][off] != UINT_MAX) 1546 1514 snprintf(bufs[3], sizeof(bufs[3]), "%u", 1547 1515 tg->iops_conf[WRITE][off]); 1548 1516 if (off == LIMIT_LOW) { 1549 - if (tg->idletime_threshold == ULONG_MAX) 1517 + if (tg->idletime_threshold_conf == ULONG_MAX) 1550 1518 strcpy(idle_time, " idle=max"); 1551 1519 else 1552 1520 snprintf(idle_time, sizeof(idle_time), " idle=%lu", 1553 - tg->idletime_threshold); 1521 + tg->idletime_threshold_conf); 1554 1522 1555 - if (tg->latency_target == ULONG_MAX) 1523 + if (tg->latency_target_conf == ULONG_MAX) 1556 1524 strcpy(latency_time, " latency=max"); 1557 1525 else 1558 1526 snprintf(latency_time, sizeof(latency_time), 1559 - " latency=%lu", tg->latency_target); 1527 + " latency=%lu", tg->latency_target_conf); 1560 1528 } 1561 1529 1562 1530 seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", ··· 1595 1563 v[2] = tg->iops_conf[READ][index]; 1596 1564 v[3] = tg->iops_conf[WRITE][index]; 1597 1565 1598 - idle_time = tg->idletime_threshold; 1599 - latency_time = tg->latency_target; 1566 + idle_time = tg->idletime_threshold_conf; 1567 + latency_time = tg->latency_target_conf; 1600 1568 while (true) { 1601 1569 char tok[27]; /* wiops=18446744073709551616 */ 1602 1570 char *p; ··· 1655 1623 tg->iops_conf[READ][LIMIT_MAX]); 1656 1624 tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], 1657 1625 tg->iops_conf[WRITE][LIMIT_MAX]); 1626 + tg->idletime_threshold_conf = idle_time; 1627 + tg->latency_target_conf = latency_time; 1658 1628 1659 - if (index == LIMIT_LOW) { 1660 - blk_throtl_update_limit_valid(tg->td); 1661 - if (tg->td->limit_valid[LIMIT_LOW]) 1662 - tg->td->limit_index = LIMIT_LOW; 1663 - tg->idletime_threshold = (idle_time == ULONG_MAX) ? 1664 - ULONG_MAX : idle_time; 1665 - tg->latency_target = (latency_time == ULONG_MAX) ? 
1666 - ULONG_MAX : latency_time; 1629 + /* force user to configure all settings for low limit */ 1630 + if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] || 1631 + tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) || 1632 + tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD || 1633 + tg->latency_target_conf == DFL_LATENCY_TARGET) { 1634 + tg->bps[READ][LIMIT_LOW] = 0; 1635 + tg->bps[WRITE][LIMIT_LOW] = 0; 1636 + tg->iops[READ][LIMIT_LOW] = 0; 1637 + tg->iops[WRITE][LIMIT_LOW] = 0; 1638 + tg->idletime_threshold = DFL_IDLE_THRESHOLD; 1639 + tg->latency_target = DFL_LATENCY_TARGET; 1640 + } else if (index == LIMIT_LOW) { 1641 + tg->idletime_threshold = tg->idletime_threshold_conf; 1642 + tg->latency_target = tg->latency_target_conf; 1667 1643 } 1668 - tg_conf_updated(tg); 1644 + 1645 + blk_throtl_update_limit_valid(tg->td); 1646 + if (tg->td->limit_valid[LIMIT_LOW]) { 1647 + if (index == LIMIT_LOW) 1648 + tg->td->limit_index = LIMIT_LOW; 1649 + } else 1650 + tg->td->limit_index = LIMIT_MAX; 1651 + tg_conf_updated(tg, index == LIMIT_LOW && 1652 + tg->td->limit_valid[LIMIT_LOW]); 1669 1653 ret = 0; 1670 1654 out_finish: 1671 1655 blkg_conf_finish(&ctx); ··· 1770 1722 /* 1771 1723 * cgroup is idle if: 1772 1724 * - single idle is too long, longer than a fixed value (in case user 1773 - * configure a too big threshold) or 4 times of slice 1725 + * configure a too big threshold) or 4 times of idletime threshold 1774 1726 * - average think time is more than threshold 1775 1727 * - IO latency is largely below threshold 1776 1728 */ 1777 - unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice); 1729 + unsigned long time; 1730 + bool ret; 1778 1731 1779 - time = min_t(unsigned long, MAX_IDLE_TIME, time); 1780 - return (ktime_get_ns() >> 10) - tg->last_finish_time > time || 1781 - tg->avg_idletime > tg->idletime_threshold || 1782 - (tg->latency_target && tg->bio_cnt && 1732 + time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); 1733 + 
ret = tg->latency_target == DFL_LATENCY_TARGET || 1734 + tg->idletime_threshold == DFL_IDLE_THRESHOLD || 1735 + (ktime_get_ns() >> 10) - tg->last_finish_time > time || 1736 + tg->avg_idletime > tg->idletime_threshold || 1737 + (tg->latency_target && tg->bio_cnt && 1783 1738 tg->bad_bio_cnt * 5 < tg->bio_cnt); 1739 + throtl_log(&tg->service_queue, 1740 + "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", 1741 + tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt, 1742 + tg->bio_cnt, ret, tg->td->scale); 1743 + return ret; 1784 1744 } 1785 1745 1786 1746 static bool throtl_tg_can_upgrade(struct throtl_grp *tg) ··· 1884 1828 struct cgroup_subsys_state *pos_css; 1885 1829 struct blkcg_gq *blkg; 1886 1830 1831 + throtl_log(&td->service_queue, "upgrade to max"); 1887 1832 td->limit_index = LIMIT_MAX; 1888 1833 td->low_upgrade_time = jiffies; 1889 1834 td->scale = 0; ··· 1907 1850 { 1908 1851 td->scale /= 2; 1909 1852 1853 + throtl_log(&td->service_queue, "downgrade, scale %d", td->scale); 1910 1854 if (td->scale) { 1911 1855 td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; 1912 1856 return; ··· 2081 2023 td->avg_buckets[i].valid = true; 2082 2024 last_latency = td->avg_buckets[i].latency; 2083 2025 } 2026 + 2027 + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) 2028 + throtl_log(&td->service_queue, 2029 + "Latency bucket %d: latency=%ld, valid=%d", i, 2030 + td->avg_buckets[i].latency, td->avg_buckets[i].valid); 2084 2031 } 2085 2032 #else 2086 2033 static inline void throtl_update_latency_buckets(struct throtl_data *td) ··· 2417 2354 void blk_throtl_register_queue(struct request_queue *q) 2418 2355 { 2419 2356 struct throtl_data *td; 2420 - struct cgroup_subsys_state *pos_css; 2421 - struct blkcg_gq *blkg; 2422 2357 2423 2358 td = q->td; 2424 2359 BUG_ON(!td); 2425 2360 2426 - if (blk_queue_nonrot(q)) { 2361 + if (blk_queue_nonrot(q)) 2427 2362 td->throtl_slice = DFL_THROTL_SLICE_SSD; 2428 - td->dft_idletime_threshold = 
DFL_IDLE_THRESHOLD_SSD; 2429 - } else { 2363 + else 2430 2364 td->throtl_slice = DFL_THROTL_SLICE_HD; 2431 - td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD; 2432 - } 2433 2365 #ifndef CONFIG_BLK_DEV_THROTTLING_LOW 2434 2366 /* if no low limit, use previous default */ 2435 2367 td->throtl_slice = DFL_THROTL_SLICE_HD; ··· 2433 2375 td->track_bio_latency = !q->mq_ops && !q->request_fn; 2434 2376 if (!td->track_bio_latency) 2435 2377 blk_stat_enable_accounting(q); 2436 - 2437 - /* 2438 - * some tg are created before queue is fully initialized, eg, nonrot 2439 - * isn't initialized yet 2440 - */ 2441 - rcu_read_lock(); 2442 - blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { 2443 - struct throtl_grp *tg = blkg_to_tg(blkg); 2444 - 2445 - tg->idletime_threshold = td->dft_idletime_threshold; 2446 - } 2447 - rcu_read_unlock(); 2448 2378 } 2449 2379 2450 2380 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+3 -1
block/partition-generic.c
··· 320 320 321 321 if (info) { 322 322 struct partition_meta_info *pinfo = alloc_part_info(disk); 323 - if (!pinfo) 323 + if (!pinfo) { 324 + err = -ENOMEM; 324 325 goto out_free_stats; 326 + } 325 327 memcpy(pinfo, info, sizeof(*info)); 326 328 p->info = pinfo; 327 329 }
+2
block/partitions/msdos.c
··· 300 300 continue; 301 301 bsd_start = le32_to_cpu(p->p_offset); 302 302 bsd_size = le32_to_cpu(p->p_size); 303 + if (memcmp(flavour, "bsd\0", 4) == 0) 304 + bsd_start += offset; 303 305 if (offset == bsd_start && size == bsd_size) 304 306 /* full parent partition, we have it already */ 305 307 continue;
+42 -23
drivers/nvme/host/core.c
··· 925 925 } 926 926 927 927 #ifdef CONFIG_BLK_DEV_INTEGRITY 928 + static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, 929 + u16 bs) 930 + { 931 + struct nvme_ns *ns = disk->private_data; 932 + u16 old_ms = ns->ms; 933 + u8 pi_type = 0; 934 + 935 + ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); 936 + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); 937 + 938 + /* PI implementation requires metadata equal t10 pi tuple size */ 939 + if (ns->ms == sizeof(struct t10_pi_tuple)) 940 + pi_type = id->dps & NVME_NS_DPS_PI_MASK; 941 + 942 + if (blk_get_integrity(disk) && 943 + (ns->pi_type != pi_type || ns->ms != old_ms || 944 + bs != queue_logical_block_size(disk->queue) || 945 + (ns->ms && ns->ext))) 946 + blk_integrity_unregister(disk); 947 + 948 + ns->pi_type = pi_type; 949 + } 950 + 928 951 static void nvme_init_integrity(struct nvme_ns *ns) 929 952 { 930 953 struct blk_integrity integrity; ··· 974 951 blk_queue_max_integrity_segments(ns->queue, 1); 975 952 } 976 953 #else 954 + static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, 955 + u16 bs) 956 + { 957 + } 977 958 static void nvme_init_integrity(struct nvme_ns *ns) 978 959 { 979 960 } ··· 1024 997 static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) 1025 998 { 1026 999 struct nvme_ns *ns = disk->private_data; 1027 - u8 lbaf, pi_type; 1028 - u16 old_ms; 1029 - unsigned short bs; 1030 - 1031 - old_ms = ns->ms; 1032 - lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; 1033 - ns->lba_shift = id->lbaf[lbaf].ds; 1034 - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); 1035 - ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); 1000 + u16 bs; 1036 1001 1037 1002 /* 1038 1003 * If identify namespace failed, use default 512 byte block size so 1039 1004 * block layer can use before failing read/write for 0 capacity. 
1040 1005 */ 1006 + ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; 1041 1007 if (ns->lba_shift == 0) 1042 1008 ns->lba_shift = 9; 1043 1009 bs = 1 << ns->lba_shift; 1044 - /* XXX: PI implementation requires metadata equal t10 pi tuple size */ 1045 - pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? 1046 - id->dps & NVME_NS_DPS_PI_MASK : 0; 1047 1010 1048 1011 blk_mq_freeze_queue(disk->queue); 1049 - if (blk_get_integrity(disk) && (ns->pi_type != pi_type || 1050 - ns->ms != old_ms || 1051 - bs != queue_logical_block_size(disk->queue) || 1052 - (ns->ms && ns->ext))) 1053 - blk_integrity_unregister(disk); 1054 1012 1055 - ns->pi_type = pi_type; 1013 + if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) 1014 + nvme_prep_integrity(disk, id, bs); 1056 1015 blk_queue_logical_block_size(ns->queue, bs); 1057 - 1058 1016 if (ns->ms && !blk_get_integrity(disk) && !ns->ext) 1059 1017 nvme_init_integrity(ns); 1060 1018 if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) ··· 1617 1605 } 1618 1606 memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); 1619 1607 1620 - if (ctrl->ops->is_fabrics) { 1608 + if (ctrl->ops->flags & NVME_F_FABRICS) { 1621 1609 ctrl->icdoff = le16_to_cpu(id->icdoff); 1622 1610 ctrl->ioccsz = le32_to_cpu(id->ioccsz); 1623 1611 ctrl->iorcsz = le32_to_cpu(id->iorcsz); ··· 2110 2098 if (ns->ndev) 2111 2099 nvme_nvm_unregister_sysfs(ns); 2112 2100 del_gendisk(ns->disk); 2113 - blk_mq_abort_requeue_list(ns->queue); 2114 2101 blk_cleanup_queue(ns->queue); 2115 2102 } 2116 2103 ··· 2447 2436 continue; 2448 2437 revalidate_disk(ns->disk); 2449 2438 blk_set_queue_dying(ns->queue); 2450 - blk_mq_abort_requeue_list(ns->queue); 2451 - blk_mq_start_stopped_hw_queues(ns->queue, true); 2439 + 2440 + /* 2441 + * Forcibly start all queues to avoid having stuck requests. 2442 + * Note that we must ensure the queues are not stopped 2443 + * when the final removal happens. 
2444 + */ 2445 + blk_mq_start_hw_queues(ns->queue); 2446 + 2447 + /* draining requests in requeue list */ 2448 + blk_mq_kick_requeue_list(ns->queue); 2452 2449 } 2453 2450 mutex_unlock(&ctrl->namespaces_mutex); 2454 2451 }
+62 -89
drivers/nvme/host/fc.c
··· 45 45 46 46 #define NVMEFC_QUEUE_DELAY 3 /* ms units */ 47 47 48 - #define NVME_FC_MAX_CONNECT_ATTEMPTS 1 49 - 50 48 struct nvme_fc_queue { 51 49 struct nvme_fc_ctrl *ctrl; 52 50 struct device *dev; ··· 163 165 struct work_struct delete_work; 164 166 struct work_struct reset_work; 165 167 struct delayed_work connect_work; 166 - int reconnect_delay; 167 - int connect_attempts; 168 168 169 169 struct kref ref; 170 170 u32 flags; ··· 1372 1376 complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); 1373 1377 if (!complete_rq) { 1374 1378 if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { 1375 - status = cpu_to_le16(NVME_SC_ABORT_REQ); 1379 + status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); 1376 1380 if (blk_queue_dying(rq->q)) 1377 - status |= cpu_to_le16(NVME_SC_DNR); 1381 + status |= cpu_to_le16(NVME_SC_DNR << 1); 1378 1382 } 1379 1383 nvme_end_request(rq, status, result); 1380 1384 } else ··· 1747 1751 dev_warn(ctrl->ctrl.device, 1748 1752 "NVME-FC{%d}: transport association error detected: %s\n", 1749 1753 ctrl->cnum, errmsg); 1750 - dev_info(ctrl->ctrl.device, 1754 + dev_warn(ctrl->ctrl.device, 1751 1755 "NVME-FC{%d}: resetting controller\n", ctrl->cnum); 1752 1756 1753 1757 /* stop the queues on error, cleanup is in reset thread */ ··· 2191 2195 if (!opts->nr_io_queues) 2192 2196 return 0; 2193 2197 2194 - dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", 2195 - opts->nr_io_queues); 2196 - 2197 2198 nvme_fc_init_io_queues(ctrl); 2198 2199 2199 2200 memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); ··· 2261 2268 if (ctrl->queue_count == 1) 2262 2269 return 0; 2263 2270 2264 - dev_info(ctrl->ctrl.device, "Recreating %d I/O queues.\n", 2265 - opts->nr_io_queues); 2266 - 2267 2271 nvme_fc_init_io_queues(ctrl); 2268 2272 2269 2273 ret = blk_mq_reinit_tagset(&ctrl->tag_set); ··· 2296 2306 int ret; 2297 2307 bool changed; 2298 2308 2299 - ctrl->connect_attempts++; 2309 + ++ctrl->ctrl.opts->nr_reconnects; 2300 2310 2301 2311 /* 2302 2312 * Create the admin queue ··· 
2393 2403 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 2394 2404 WARN_ON_ONCE(!changed); 2395 2405 2396 - ctrl->connect_attempts = 0; 2397 - 2398 - kref_get(&ctrl->ctrl.kref); 2406 + ctrl->ctrl.opts->nr_reconnects = 0; 2399 2407 2400 2408 if (ctrl->queue_count > 1) { 2401 2409 nvme_start_queues(&ctrl->ctrl); ··· 2524 2536 2525 2537 /* 2526 2538 * tear down the controller 2527 - * This will result in the last reference on the nvme ctrl to 2528 - * expire, calling the transport nvme_fc_nvme_ctrl_freed() callback. 2529 - * From there, the transport will tear down it's logical queues and 2530 - * association. 2539 + * After the last reference on the nvme ctrl is removed, 2540 + * the transport nvme_fc_nvme_ctrl_freed() callback will be 2541 + * invoked. From there, the transport will tear down it's 2542 + * logical queues and association. 2531 2543 */ 2532 2544 nvme_uninit_ctrl(&ctrl->ctrl); 2533 2545 2534 2546 nvme_put_ctrl(&ctrl->ctrl); 2535 2547 } 2536 2548 2549 + static bool 2550 + __nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl) 2551 + { 2552 + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) 2553 + return true; 2554 + 2555 + if (!queue_work(nvme_fc_wq, &ctrl->delete_work)) 2556 + return true; 2557 + 2558 + return false; 2559 + } 2560 + 2537 2561 static int 2538 2562 __nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl) 2539 2563 { 2540 - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) 2541 - return -EBUSY; 2542 - 2543 - if (!queue_work(nvme_fc_wq, &ctrl->delete_work)) 2544 - return -EBUSY; 2545 - 2546 - return 0; 2564 + return __nvme_fc_schedule_delete_work(ctrl) ? 
-EBUSY : 0; 2547 2565 } 2548 2566 2549 2567 /* ··· 2575 2581 } 2576 2582 2577 2583 static void 2584 + nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) 2585 + { 2586 + /* If we are resetting/deleting then do nothing */ 2587 + if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) { 2588 + WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || 2589 + ctrl->ctrl.state == NVME_CTRL_LIVE); 2590 + return; 2591 + } 2592 + 2593 + dev_info(ctrl->ctrl.device, 2594 + "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", 2595 + ctrl->cnum, status); 2596 + 2597 + if (nvmf_should_reconnect(&ctrl->ctrl)) { 2598 + dev_info(ctrl->ctrl.device, 2599 + "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", 2600 + ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); 2601 + queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, 2602 + ctrl->ctrl.opts->reconnect_delay * HZ); 2603 + } else { 2604 + dev_warn(ctrl->ctrl.device, 2605 + "NVME-FC{%d}: Max reconnect attempts (%d) " 2606 + "reached. Removing controller\n", 2607 + ctrl->cnum, ctrl->ctrl.opts->nr_reconnects); 2608 + WARN_ON(__nvme_fc_schedule_delete_work(ctrl)); 2609 + } 2610 + } 2611 + 2612 + static void 2578 2613 nvme_fc_reset_ctrl_work(struct work_struct *work) 2579 2614 { 2580 2615 struct nvme_fc_ctrl *ctrl = ··· 2614 2591 nvme_fc_delete_association(ctrl); 2615 2592 2616 2593 ret = nvme_fc_create_association(ctrl); 2617 - if (ret) { 2618 - dev_warn(ctrl->ctrl.device, 2619 - "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", 2620 - ctrl->cnum, ret); 2621 - if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) { 2622 - dev_warn(ctrl->ctrl.device, 2623 - "NVME-FC{%d}: Max reconnect attempts (%d) " 2624 - "reached. 
Removing controller\n", 2625 - ctrl->cnum, ctrl->connect_attempts); 2626 - 2627 - if (!nvme_change_ctrl_state(&ctrl->ctrl, 2628 - NVME_CTRL_DELETING)) { 2629 - dev_err(ctrl->ctrl.device, 2630 - "NVME-FC{%d}: failed to change state " 2631 - "to DELETING\n", ctrl->cnum); 2632 - return; 2633 - } 2634 - 2635 - WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work)); 2636 - return; 2637 - } 2638 - 2639 - dev_warn(ctrl->ctrl.device, 2640 - "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", 2641 - ctrl->cnum, ctrl->reconnect_delay); 2642 - queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, 2643 - ctrl->reconnect_delay * HZ); 2644 - } else 2594 + if (ret) 2595 + nvme_fc_reconnect_or_delete(ctrl, ret); 2596 + else 2645 2597 dev_info(ctrl->ctrl.device, 2646 2598 "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); 2647 2599 } ··· 2630 2632 { 2631 2633 struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); 2632 2634 2633 - dev_warn(ctrl->ctrl.device, 2635 + dev_info(ctrl->ctrl.device, 2634 2636 "NVME-FC{%d}: admin requested controller reset\n", ctrl->cnum); 2635 2637 2636 2638 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) ··· 2647 2649 static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { 2648 2650 .name = "fc", 2649 2651 .module = THIS_MODULE, 2650 - .is_fabrics = true, 2652 + .flags = NVME_F_FABRICS, 2651 2653 .reg_read32 = nvmf_reg_read32, 2652 2654 .reg_read64 = nvmf_reg_read64, 2653 2655 .reg_write32 = nvmf_reg_write32, ··· 2669 2671 struct nvme_fc_ctrl, connect_work); 2670 2672 2671 2673 ret = nvme_fc_create_association(ctrl); 2672 - if (ret) { 2673 - dev_warn(ctrl->ctrl.device, 2674 - "NVME-FC{%d}: Reconnect attempt failed (%d)\n", 2675 - ctrl->cnum, ret); 2676 - if (ctrl->connect_attempts >= NVME_FC_MAX_CONNECT_ATTEMPTS) { 2677 - dev_warn(ctrl->ctrl.device, 2678 - "NVME-FC{%d}: Max reconnect attempts (%d) " 2679 - "reached. 
Removing controller\n", 2680 - ctrl->cnum, ctrl->connect_attempts); 2681 - 2682 - if (!nvme_change_ctrl_state(&ctrl->ctrl, 2683 - NVME_CTRL_DELETING)) { 2684 - dev_err(ctrl->ctrl.device, 2685 - "NVME-FC{%d}: failed to change state " 2686 - "to DELETING\n", ctrl->cnum); 2687 - return; 2688 - } 2689 - 2690 - WARN_ON(!queue_work(nvme_fc_wq, &ctrl->delete_work)); 2691 - return; 2692 - } 2693 - 2694 - dev_warn(ctrl->ctrl.device, 2695 - "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", 2696 - ctrl->cnum, ctrl->reconnect_delay); 2697 - queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, 2698 - ctrl->reconnect_delay * HZ); 2699 - } else 2674 + if (ret) 2675 + nvme_fc_reconnect_or_delete(ctrl, ret); 2676 + else 2700 2677 dev_info(ctrl->ctrl.device, 2701 2678 "NVME-FC{%d}: controller reconnect complete\n", 2702 2679 ctrl->cnum); ··· 2728 2755 INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work); 2729 2756 INIT_WORK(&ctrl->reset_work, nvme_fc_reset_ctrl_work); 2730 2757 INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); 2731 - ctrl->reconnect_delay = opts->reconnect_delay; 2732 2758 spin_lock_init(&ctrl->lock); 2733 2759 2734 2760 /* io queue count */ ··· 2791 2819 ctrl->ctrl.opts = NULL; 2792 2820 /* initiate nvme ctrl ref counting teardown */ 2793 2821 nvme_uninit_ctrl(&ctrl->ctrl); 2794 - nvme_put_ctrl(&ctrl->ctrl); 2795 2822 2796 2823 /* as we're past the point where we transition to the ref 2797 2824 * counting teardown path, if we return a bad pointer here, ··· 2805 2834 ret = -EIO; 2806 2835 return ERR_PTR(ret); 2807 2836 } 2837 + 2838 + kref_get(&ctrl->ctrl.kref); 2808 2839 2809 2840 dev_info(ctrl->ctrl.device, 2810 2841 "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", ··· 2944 2971 static struct nvmf_transport_ops nvme_fc_transport = { 2945 2972 .name = "fc", 2946 2973 .required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR, 2947 - .allowed_opts = NVMF_OPT_RECONNECT_DELAY, 2974 + .allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO, 2948 2975 
.create_ctrl = nvme_fc_create_ctrl, 2949 2976 }; 2950 2977
+3 -1
drivers/nvme/host/nvme.h
··· 208 208 struct nvme_ctrl_ops { 209 209 const char *name; 210 210 struct module *module; 211 - bool is_fabrics; 211 + unsigned int flags; 212 + #define NVME_F_FABRICS (1 << 0) 213 + #define NVME_F_METADATA_SUPPORTED (1 << 1) 212 214 int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); 213 215 int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); 214 216 int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
+9 -6
drivers/nvme/host/pci.c
··· 263 263 c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr); 264 264 265 265 if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) { 266 - dev_warn(dev->dev, "unable to set dbbuf\n"); 266 + dev_warn(dev->ctrl.device, "unable to set dbbuf\n"); 267 267 /* Free memory and continue on */ 268 268 nvme_dbbuf_dma_free(dev); 269 269 } ··· 1394 1394 result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, 1395 1395 &pci_status); 1396 1396 if (result == PCIBIOS_SUCCESSFUL) 1397 - dev_warn(dev->dev, 1397 + dev_warn(dev->ctrl.device, 1398 1398 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", 1399 1399 csts, pci_status); 1400 1400 else 1401 - dev_warn(dev->dev, 1401 + dev_warn(dev->ctrl.device, 1402 1402 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", 1403 1403 csts, result); 1404 1404 } ··· 1740 1740 */ 1741 1741 if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) { 1742 1742 dev->q_depth = 2; 1743 - dev_warn(dev->dev, "detected Apple NVMe controller, set " 1744 - "queue depth=%u to work around controller resets\n", 1743 + dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " 1744 + "set queue depth=%u to work around controller resets\n", 1745 1745 dev->q_depth); 1746 1746 } 1747 1747 ··· 1759 1759 if (dev->cmbsz) { 1760 1760 if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, 1761 1761 &dev_attr_cmb.attr, NULL)) 1762 - dev_warn(dev->dev, 1762 + dev_warn(dev->ctrl.device, 1763 1763 "failed to add sysfs attribute for CMB\n"); 1764 1764 } 1765 1765 } ··· 2047 2047 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { 2048 2048 .name = "pcie", 2049 2049 .module = THIS_MODULE, 2050 + .flags = NVME_F_METADATA_SUPPORTED, 2050 2051 .reg_read32 = nvme_pci_reg_read32, 2051 2052 .reg_write32 = nvme_pci_reg_write32, 2052 2053 .reg_read64 = nvme_pci_reg_read64, ··· 2294 2293 { PCI_VDEVICE(INTEL, 0x0a54), 2295 2294 .driver_data = NVME_QUIRK_STRIPE_SIZE | 2296 2295 NVME_QUIRK_DEALLOCATE_ZEROES, }, 2296 + { 
PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ 2297 + .driver_data = NVME_QUIRK_NO_DEEPEST_PS }, 2297 2298 { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ 2298 2299 .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, 2299 2300 { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */
+15 -5
drivers/nvme/host/rdma.c
··· 1038 1038 nvme_rdma_wr_error(cq, wc, "SEND"); 1039 1039 } 1040 1040 1041 + static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) 1042 + { 1043 + int sig_limit; 1044 + 1045 + /* 1046 + * We signal completion every queue depth/2 and also handle the 1047 + * degenerated case of a device with queue_depth=1, where we 1048 + * would need to signal every message. 1049 + */ 1050 + sig_limit = max(queue->queue_size / 2, 1); 1051 + return (++queue->sig_count % sig_limit) == 0; 1052 + } 1053 + 1041 1054 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, 1042 1055 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge, 1043 1056 struct ib_send_wr *first, bool flush) ··· 1078 1065 * Would have been way to obvious to handle this in hardware or 1079 1066 * at least the RDMA stack.. 1080 1067 * 1081 - * This messy and racy code sniplet is copy and pasted from the iSER 1082 - * initiator, and the magic '32' comes from there as well. 1083 - * 1084 1068 * Always signal the flushes. The magic request used for the flush 1085 1069 * sequencer is not allocated in our driver's tagset and it's 1086 1070 * triggered to be freed by blk_cleanup_queue(). So we need to ··· 1085 1075 * embedded in request's payload, is not freed when __ib_process_cq() 1086 1076 * calls wr_cqe->done(). 1087 1077 */ 1088 - if ((++queue->sig_count % 32) == 0 || flush) 1078 + if (nvme_rdma_queue_sig_limit(queue) || flush) 1089 1079 wr.send_flags |= IB_SEND_SIGNALED; 1090 1080 1091 1081 if (first) ··· 1792 1782 static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { 1793 1783 .name = "rdma", 1794 1784 .module = THIS_MODULE, 1795 - .is_fabrics = true, 1785 + .flags = NVME_F_FABRICS, 1796 1786 .reg_read32 = nvmf_reg_read32, 1797 1787 .reg_read64 = nvmf_reg_read64, 1798 1788 .reg_write32 = nvmf_reg_write32,
+1 -1
drivers/nvme/target/loop.c
··· 558 558 static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { 559 559 .name = "loop", 560 560 .module = THIS_MODULE, 561 - .is_fabrics = true, 561 + .flags = NVME_F_FABRICS, 562 562 .reg_read32 = nvmf_reg_read32, 563 563 .reg_read64 = nvmf_reg_read64, 564 564 .reg_write32 = nvmf_reg_write32,
-1
include/linux/blk-mq.h
··· 238 238 bool kick_requeue_list); 239 239 void blk_mq_kick_requeue_list(struct request_queue *q); 240 240 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); 241 - void blk_mq_abort_requeue_list(struct request_queue *q); 242 241 void blk_mq_complete_request(struct request *rq); 243 242 244 243 bool blk_mq_queue_stopped(struct request_queue *q);