Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:

- an NVMe fix from Gabriel, fixing a suspend/resume issue on some
setups

- addition of a few missing entries in the block queue sysfs
documentation, from Joe

- a fix for a sparse shadow warning for the bvec iterator, from
Johannes

- a fix for a writeback deadlock involving raid issuing barriers and
the plug not being flushed when we wake up the flusher threads. From
Konstantin

- a set of patches for the NVMe target/loop/rdma code, from Roland and
Sagi

* 'for-linus' of git://git.kernel.dk/linux-block:
bvec: avoid variable shadowing warning
doc: update block/queue-sysfs.txt entries
nvme: Suspend all queues before deletion
mm, writeback: flush plugged IO in wakeup_flusher_threads()
nvme-rdma: Remove unused includes
nvme-rdma: start async event handler after reconnecting to a controller
nvmet: Fix controller serial number inconsistency
nvmet-rdma: Don't use the inline buffer in order to avoid allocation for small reads
nvmet-rdma: Correctly handle RDMA device hot removal
nvme-rdma: Make sure to shutdown the controller if we can
nvme-loop: Remove duplicate call to nvme_remove_namespaces
nvme-rdma: Free the I/O tags when we delete the controller
nvme-rdma: Remove duplicate call to nvme_remove_namespaces
nvme-rdma: Fix device removal handling
nvme-rdma: Queue ns scanning after a successful reconnection
nvme-rdma: Don't leak uninitialized memory in connect request private data

+162 -87
+18
Documentation/block/queue-sysfs.txt
··· 14 14 This file allows to turn off the disk entropy contribution. Default 15 15 value of this file is '1'(on). 16 16 17 + dax (RO) 18 + -------- 19 + This file indicates whether the device supports Direct Access (DAX), 20 + used by CPU-addressable storage to bypass the pagecache. It shows '1' 21 + if true, '0' if not. 22 + 17 23 discard_granularity (RO) 18 24 ----------------------- 19 25 This shows the size of internal allocation of the device in bytes, if ··· 51 45 hw_sector_size (RO) 52 46 ------------------- 53 47 This is the hardware sector size of the device, in bytes. 48 + 49 + io_poll (RW) 50 + ------------ 51 + When read, this file shows the total number of block IO polls and how 52 + many returned success. Writing '0' to this file will disable polling 53 + for this device. Writing any non-zero value will enable this feature. 54 54 55 55 iostats (RW) 56 56 ------------- ··· 162 150 device state. This means that it might not be safe to toggle the 163 151 setting from "write back" to "write through", since that will also 164 152 eliminate cache flushes issued by the kernel. 153 + 154 + write_same_max_bytes (RO) 155 + ------------------------- 156 + This is the number of bytes the device can write in a single write-same 157 + command. A value of '0' means write-same is not supported by this 158 + device. 165 159 166 160 167 161 Jens Axboe <jens.axboe@oracle.com>, February 2009
+8 -12
drivers/nvme/host/pci.c
··· 1543 1543 reinit_completion(&dev->ioq_wait); 1544 1544 retry: 1545 1545 timeout = ADMIN_TIMEOUT; 1546 - for (; i > 0; i--) { 1547 - struct nvme_queue *nvmeq = dev->queues[i]; 1548 - 1549 - if (!pass) 1550 - nvme_suspend_queue(nvmeq); 1551 - if (nvme_delete_queue(nvmeq, opcode)) 1546 + for (; i > 0; i--, sent++) 1547 + if (nvme_delete_queue(dev->queues[i], opcode)) 1552 1548 break; 1553 - ++sent; 1554 - } 1549 + 1555 1550 while (sent--) { 1556 1551 timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); 1557 1552 if (timeout == 0) ··· 1688 1693 nvme_stop_queues(&dev->ctrl); 1689 1694 csts = readl(dev->bar + NVME_REG_CSTS); 1690 1695 } 1696 + 1697 + for (i = dev->queue_count - 1; i > 0; i--) 1698 + nvme_suspend_queue(dev->queues[i]); 1699 + 1691 1700 if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { 1692 - for (i = dev->queue_count - 1; i >= 0; i--) { 1693 - struct nvme_queue *nvmeq = dev->queues[i]; 1694 - nvme_suspend_queue(nvmeq); 1695 - } 1701 + nvme_suspend_queue(dev->queues[0]); 1696 1702 } else { 1697 1703 nvme_disable_io_queues(dev); 1698 1704 nvme_disable_admin_queue(dev, shutdown);
+47 -40
drivers/nvme/host/rdma.c
··· 12 12 * more details. 13 13 */ 14 14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 15 - #include <linux/delay.h> 16 15 #include <linux/module.h> 17 16 #include <linux/init.h> 18 17 #include <linux/slab.h> 19 18 #include <linux/err.h> 20 19 #include <linux/string.h> 21 - #include <linux/jiffies.h> 22 20 #include <linux/atomic.h> 23 21 #include <linux/blk-mq.h> 24 22 #include <linux/types.h> ··· 24 26 #include <linux/mutex.h> 25 27 #include <linux/scatterlist.h> 26 28 #include <linux/nvme.h> 27 - #include <linux/t10-pi.h> 28 29 #include <asm/unaligned.h> 29 30 30 31 #include <rdma/ib_verbs.h> ··· 166 169 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, 167 170 struct rdma_cm_event *event); 168 171 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); 169 - static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl); 170 172 171 173 /* XXX: really should move to a generic header sooner or later.. */ 172 174 static inline void put_unaligned_le24(u32 val, u8 *p) ··· 683 687 list_del(&ctrl->list); 684 688 mutex_unlock(&nvme_rdma_ctrl_mutex); 685 689 686 - if (ctrl->ctrl.tagset) { 687 - blk_cleanup_queue(ctrl->ctrl.connect_q); 688 - blk_mq_free_tag_set(&ctrl->tag_set); 689 - nvme_rdma_dev_put(ctrl->device); 690 - } 691 690 kfree(ctrl->queues); 692 691 nvmf_free_options(nctrl->opts); 693 692 free_ctrl: ··· 739 748 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 740 749 WARN_ON_ONCE(!changed); 741 750 742 - if (ctrl->queue_count > 1) 751 + if (ctrl->queue_count > 1) { 743 752 nvme_start_queues(&ctrl->ctrl); 753 + nvme_queue_scan(&ctrl->ctrl); 754 + nvme_queue_async_events(&ctrl->ctrl); 755 + } 744 756 745 757 dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); 746 758 ··· 1263 1269 { 1264 1270 struct nvme_rdma_ctrl *ctrl = queue->ctrl; 1265 1271 struct rdma_conn_param param = { }; 1266 - struct nvme_rdma_cm_req priv; 1272 + struct nvme_rdma_cm_req priv = { }; 1267 1273 int ret; 1268 1274 1269 1275 param.qp_num = 
queue->qp->qp_num; ··· 1312 1318 * that caught the event. Since we hold the callout until the controller 1313 1319 * deletion is completed, we'll deadlock if the controller deletion will 1314 1320 * call rdma_destroy_id on this queue's cm_id. Thus, we claim ownership 1315 - * of destroying this queue before-hand, destroy the queue resources 1316 - * after the controller deletion completed with the exception of destroying 1317 - * the cm_id implicitely by returning a non-zero rc to the callout. 1321 + * of destroying this queue before-hand, destroy the queue resources, 1322 + * then queue the controller deletion which won't destroy this queue and 1323 + * we destroy the cm_id implicitely by returning a non-zero rc to the callout. 1318 1324 */ 1319 1325 static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue) 1320 1326 { 1321 1327 struct nvme_rdma_ctrl *ctrl = queue->ctrl; 1322 - int ret, ctrl_deleted = 0; 1328 + int ret; 1323 1329 1324 - /* First disable the queue so ctrl delete won't free it */ 1325 - if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) 1326 - goto out; 1330 + /* Own the controller deletion */ 1331 + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) 1332 + return 0; 1327 1333 1328 - /* delete the controller */ 1329 - ret = __nvme_rdma_del_ctrl(ctrl); 1330 - if (!ret) { 1331 - dev_warn(ctrl->ctrl.device, 1332 - "Got rdma device removal event, deleting ctrl\n"); 1333 - flush_work(&ctrl->delete_work); 1334 + dev_warn(ctrl->ctrl.device, 1335 + "Got rdma device removal event, deleting ctrl\n"); 1336 + 1337 + /* Get rid of reconnect work if its running */ 1338 + cancel_delayed_work_sync(&ctrl->reconnect_work); 1339 + 1340 + /* Disable the queue so ctrl delete won't free it */ 1341 + if (test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) { 1342 + /* Free this queue ourselves */ 1343 + nvme_rdma_stop_queue(queue); 1344 + nvme_rdma_destroy_queue_ib(queue); 1334 1345 1335 1346 /* Return non-zero so the cm_id will destroy 
implicitly */ 1336 - ctrl_deleted = 1; 1337 - 1338 - /* Free this queue ourselves */ 1339 - rdma_disconnect(queue->cm_id); 1340 - ib_drain_qp(queue->qp); 1341 - nvme_rdma_destroy_queue_ib(queue); 1347 + ret = 1; 1342 1348 } 1343 1349 1344 - out: 1345 - return ctrl_deleted; 1350 + /* Queue controller deletion */ 1351 + queue_work(nvme_rdma_wq, &ctrl->delete_work); 1352 + flush_work(&ctrl->delete_work); 1353 + return ret; 1346 1354 } 1347 1355 1348 1356 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, ··· 1644 1648 nvme_rdma_free_io_queues(ctrl); 1645 1649 } 1646 1650 1647 - if (ctrl->ctrl.state == NVME_CTRL_LIVE) 1651 + if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags)) 1648 1652 nvme_shutdown_ctrl(&ctrl->ctrl); 1649 1653 1650 1654 blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); ··· 1653 1657 nvme_rdma_destroy_admin_queue(ctrl); 1654 1658 } 1655 1659 1660 + static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) 1661 + { 1662 + nvme_uninit_ctrl(&ctrl->ctrl); 1663 + if (shutdown) 1664 + nvme_rdma_shutdown_ctrl(ctrl); 1665 + 1666 + if (ctrl->ctrl.tagset) { 1667 + blk_cleanup_queue(ctrl->ctrl.connect_q); 1668 + blk_mq_free_tag_set(&ctrl->tag_set); 1669 + nvme_rdma_dev_put(ctrl->device); 1670 + } 1671 + 1672 + nvme_put_ctrl(&ctrl->ctrl); 1673 + } 1674 + 1656 1675 static void nvme_rdma_del_ctrl_work(struct work_struct *work) 1657 1676 { 1658 1677 struct nvme_rdma_ctrl *ctrl = container_of(work, 1659 1678 struct nvme_rdma_ctrl, delete_work); 1660 1679 1661 - nvme_remove_namespaces(&ctrl->ctrl); 1662 - nvme_rdma_shutdown_ctrl(ctrl); 1663 - nvme_uninit_ctrl(&ctrl->ctrl); 1664 - nvme_put_ctrl(&ctrl->ctrl); 1680 + __nvme_rdma_remove_ctrl(ctrl, true); 1665 1681 } 1666 1682 1667 1683 static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) ··· 1706 1698 struct nvme_rdma_ctrl *ctrl = container_of(work, 1707 1699 struct nvme_rdma_ctrl, delete_work); 1708 1700 1709 - nvme_remove_namespaces(&ctrl->ctrl); 1710 - 
nvme_uninit_ctrl(&ctrl->ctrl); 1711 - nvme_put_ctrl(&ctrl->ctrl); 1701 + __nvme_rdma_remove_ctrl(ctrl, false); 1712 1702 } 1713 1703 1714 1704 static void nvme_rdma_reset_ctrl_work(struct work_struct *work) ··· 1745 1739 if (ctrl->queue_count > 1) { 1746 1740 nvme_start_queues(&ctrl->ctrl); 1747 1741 nvme_queue_scan(&ctrl->ctrl); 1742 + nvme_queue_async_events(&ctrl->ctrl); 1748 1743 } 1749 1744 1750 1745 return;
+1 -5
drivers/nvme/target/admin-cmd.c
··· 13 13 */ 14 14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 15 15 #include <linux/module.h> 16 - #include <linux/random.h> 17 16 #include <generated/utsrelease.h> 18 17 #include "nvmet.h" 19 18 ··· 82 83 { 83 84 struct nvmet_ctrl *ctrl = req->sq->ctrl; 84 85 struct nvme_id_ctrl *id; 85 - u64 serial; 86 86 u16 status = 0; 87 87 88 88 id = kzalloc(sizeof(*id), GFP_KERNEL); ··· 94 96 id->vid = 0; 95 97 id->ssvid = 0; 96 98 97 - /* generate a random serial number as our controllers are ephemeral: */ 98 - get_random_bytes(&serial, sizeof(serial)); 99 99 memset(id->sn, ' ', sizeof(id->sn)); 100 - snprintf(id->sn, sizeof(id->sn), "%llx", serial); 100 + snprintf(id->sn, sizeof(id->sn), "%llx", ctrl->serial); 101 101 102 102 memset(id->mn, ' ', sizeof(id->mn)); 103 103 strncpy((char *)id->mn, "Linux", sizeof(id->mn));
+4
drivers/nvme/target/core.c
··· 13 13 */ 14 14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 15 15 #include <linux/module.h> 16 + #include <linux/random.h> 16 17 #include "nvmet.h" 17 18 18 19 static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; ··· 728 727 729 728 memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); 730 729 memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); 730 + 731 + /* generate a random serial number as our controllers are ephemeral: */ 732 + get_random_bytes(&ctrl->serial, sizeof(ctrl->serial)); 731 733 732 734 kref_init(&ctrl->ref); 733 735 ctrl->subsys = subsys;
+1 -3
drivers/nvme/target/loop.c
··· 414 414 struct nvme_loop_ctrl *ctrl = container_of(work, 415 415 struct nvme_loop_ctrl, delete_work); 416 416 417 - nvme_remove_namespaces(&ctrl->ctrl); 418 - nvme_loop_shutdown_ctrl(ctrl); 419 417 nvme_uninit_ctrl(&ctrl->ctrl); 418 + nvme_loop_shutdown_ctrl(ctrl); 420 419 nvme_put_ctrl(&ctrl->ctrl); 421 420 } 422 421 ··· 500 501 nvme_loop_destroy_admin_queue(ctrl); 501 502 out_disable: 502 503 dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); 503 - nvme_remove_namespaces(&ctrl->ctrl); 504 504 nvme_uninit_ctrl(&ctrl->ctrl); 505 505 nvme_put_ctrl(&ctrl->ctrl); 506 506 }
+1
drivers/nvme/target/nvmet.h
··· 113 113 114 114 struct mutex lock; 115 115 u64 cap; 116 + u64 serial; 116 117 u32 cc; 117 118 u32 csts; 118 119
+74 -26
drivers/nvme/target/rdma.c
··· 77 77 NVMET_RDMA_Q_CONNECTING, 78 78 NVMET_RDMA_Q_LIVE, 79 79 NVMET_RDMA_Q_DISCONNECTING, 80 + NVMET_RDMA_IN_DEVICE_REMOVAL, 80 81 }; 81 82 82 83 struct nvmet_rdma_queue { ··· 616 615 if (!len) 617 616 return 0; 618 617 619 - /* use the already allocated data buffer if possible */ 620 - if (len <= NVMET_RDMA_INLINE_DATA_SIZE && rsp->queue->host_qid) { 621 - nvmet_rdma_use_inline_sg(rsp, len, 0); 622 - } else { 623 - status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, 624 - len); 625 - if (status) 626 - return status; 627 - } 618 + status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, 619 + len); 620 + if (status) 621 + return status; 628 622 629 623 ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, 630 624 rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, ··· 980 984 struct nvmet_rdma_device *dev = queue->dev; 981 985 982 986 nvmet_rdma_free_queue(queue); 983 - rdma_destroy_id(cm_id); 987 + 988 + if (queue->state != NVMET_RDMA_IN_DEVICE_REMOVAL) 989 + rdma_destroy_id(cm_id); 990 + 984 991 kref_put(&dev->ref, nvmet_rdma_free_dev); 985 992 } 986 993 ··· 1232 1233 switch (queue->state) { 1233 1234 case NVMET_RDMA_Q_CONNECTING: 1234 1235 case NVMET_RDMA_Q_LIVE: 1235 - disconnect = true; 1236 1236 queue->state = NVMET_RDMA_Q_DISCONNECTING; 1237 + case NVMET_RDMA_IN_DEVICE_REMOVAL: 1238 + disconnect = true; 1237 1239 break; 1238 1240 case NVMET_RDMA_Q_DISCONNECTING: 1239 1241 break; ··· 1272 1272 schedule_work(&queue->release_work); 1273 1273 } 1274 1274 1275 + /** 1276 + * nvme_rdma_device_removal() - Handle RDMA device removal 1277 + * @queue: nvmet rdma queue (cm id qp_context) 1278 + * @addr: nvmet address (cm_id context) 1279 + * 1280 + * DEVICE_REMOVAL event notifies us that the RDMA device is about 1281 + * to unplug so we should take care of destroying our RDMA resources. 1282 + * This event will be generated for each allocated cm_id. 
1283 + * 1284 + * Note that this event can be generated on a normal queue cm_id 1285 + * and/or a device bound listener cm_id (where in this case 1286 + * queue will be null). 1287 + * 1288 + * we claim ownership on destroying the cm_id. For queues we move 1289 + * the queue state to NVMET_RDMA_IN_DEVICE_REMOVAL and for port 1290 + * we nullify the priv to prevent double cm_id destruction and destroying 1291 + * the cm_id implicitely by returning a non-zero rc to the callout. 1292 + */ 1293 + static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, 1294 + struct nvmet_rdma_queue *queue) 1295 + { 1296 + unsigned long flags; 1297 + 1298 + if (!queue) { 1299 + struct nvmet_port *port = cm_id->context; 1300 + 1301 + /* 1302 + * This is a listener cm_id. Make sure that 1303 + * future remove_port won't invoke a double 1304 + * cm_id destroy. use atomic xchg to make sure 1305 + * we don't compete with remove_port. 1306 + */ 1307 + if (xchg(&port->priv, NULL) != cm_id) 1308 + return 0; 1309 + } else { 1310 + /* 1311 + * This is a queue cm_id. Make sure that 1312 + * release queue will not destroy the cm_id 1313 + * and schedule all ctrl queues removal (only 1314 + * if the queue is not disconnecting already). 1315 + */ 1316 + spin_lock_irqsave(&queue->state_lock, flags); 1317 + if (queue->state != NVMET_RDMA_Q_DISCONNECTING) 1318 + queue->state = NVMET_RDMA_IN_DEVICE_REMOVAL; 1319 + spin_unlock_irqrestore(&queue->state_lock, flags); 1320 + nvmet_rdma_queue_disconnect(queue); 1321 + flush_scheduled_work(); 1322 + } 1323 + 1324 + /* 1325 + * We need to return 1 so that the core will destroy 1326 + * it's own ID. What a great API design.. 
1327 + */ 1328 + return 1; 1329 + } 1330 + 1275 1331 static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, 1276 1332 struct rdma_cm_event *event) 1277 1333 { ··· 1350 1294 break; 1351 1295 case RDMA_CM_EVENT_ADDR_CHANGE: 1352 1296 case RDMA_CM_EVENT_DISCONNECTED: 1353 - case RDMA_CM_EVENT_DEVICE_REMOVAL: 1354 1297 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 1355 - /* 1356 - * We can get the device removal callback even for a 1357 - * CM ID that we aren't actually using. In that case 1358 - * the context pointer is NULL, so we shouldn't try 1359 - * to disconnect a non-existing queue. But we also 1360 - * need to return 1 so that the core will destroy 1361 - * it's own ID. What a great API design.. 1362 - */ 1363 - if (queue) 1364 - nvmet_rdma_queue_disconnect(queue); 1365 - else 1366 - ret = 1; 1298 + nvmet_rdma_queue_disconnect(queue); 1299 + break; 1300 + case RDMA_CM_EVENT_DEVICE_REMOVAL: 1301 + ret = nvmet_rdma_device_removal(cm_id, queue); 1367 1302 break; 1368 1303 case RDMA_CM_EVENT_REJECTED: 1369 1304 case RDMA_CM_EVENT_UNREACHABLE: ··· 1443 1396 1444 1397 static void nvmet_rdma_remove_port(struct nvmet_port *port) 1445 1398 { 1446 - struct rdma_cm_id *cm_id = port->priv; 1399 + struct rdma_cm_id *cm_id = xchg(&port->priv, NULL); 1447 1400 1448 - rdma_destroy_id(cm_id); 1401 + if (cm_id) 1402 + rdma_destroy_id(cm_id); 1449 1403 } 1450 1404 1451 1405 static struct nvmet_fabrics_ops nvmet_rdma_ops = {
+6
fs/fs-writeback.c
··· 1949 1949 { 1950 1950 struct backing_dev_info *bdi; 1951 1951 1952 + /* 1953 + * If we are expecting writeback progress we must submit plugged IO. 1954 + */ 1955 + if (blk_needs_flush_plug(current)) 1956 + blk_schedule_flush_plug(current); 1957 + 1952 1958 if (!nr_pages) 1953 1959 nr_pages = get_nr_dirty_pages(); 1954 1960
+2 -1
include/linux/bvec.h
··· 74 74 "Attempted to advance past end of bvec iter\n"); 75 75 76 76 while (bytes) { 77 - unsigned len = min(bytes, bvec_iter_len(bv, *iter)); 77 + unsigned iter_len = bvec_iter_len(bv, *iter); 78 + unsigned len = min(bytes, iter_len); 78 79 79 80 bytes -= len; 80 81 iter->bi_size -= len;