Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-linus2' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
"Round 2 of this. I cut back to the bare necessities, the patch is
still larger than it usually would be at this time, due to the number
of NVMe fixes in there. This pull request contains:

- The 4 core fixes from Ming, that fix both problems with exceeding
the virtual boundary limit in case of merging, and the gap checking
for cloned bios.

- NVMe fixes from Keith and Christoph:

- Regression on larger user commands, causing problems with
reading log pages (for instance). This touches both NVMe,
and the block core since that is now generally utilized also
for these types of commands.

- Hot removal fixes.

- User-exploitable issue with passthrough IO commands: if a zero
length is given, we would fault on writing to the zero
page.

- Fix for a hang under error conditions.

- And finally, the current series regression for umount with cgroup
writeback, where the final flush would happen asynchronously and hence open
up a window after umount where the device wasn't consistent. fsck
right after umount would show this. From Tejun"

* 'for-linus2' of git://git.kernel.dk/linux-block:
block: support large requests in blk_rq_map_user_iov
block: fix blk_rq_get_max_sectors for driver private requests
nvme: fix max_segments integer truncation
nvme: set queue limits for the admin queue
writeback: flush inode cgroup wb switches instead of pinning super_block
NVMe: Fix 0-length integrity payload
NVMe: Don't allow unsupported flags
NVMe: Move error handling to failed reset handler
NVMe: Simplify device reset failure
NVMe: Fix namespace removal deadlock
NVMe: Use IDA for namespace disk naming
NVMe: Don't unmap controller registers on reset
block: merge: get the 1st and last bvec via helpers
block: get the 1st and last bvec via helpers
block: check virt boundary in bio_will_gap()
block: bio: introduce helpers to get the 1st and last bvec

+341 -148
+61 -30
block/blk-map.c
··· 57 57 return ret; 58 58 } 59 59 60 + static int __blk_rq_map_user_iov(struct request *rq, 61 + struct rq_map_data *map_data, struct iov_iter *iter, 62 + gfp_t gfp_mask, bool copy) 63 + { 64 + struct request_queue *q = rq->q; 65 + struct bio *bio, *orig_bio; 66 + int ret; 67 + 68 + if (copy) 69 + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); 70 + else 71 + bio = bio_map_user_iov(q, iter, gfp_mask); 72 + 73 + if (IS_ERR(bio)) 74 + return PTR_ERR(bio); 75 + 76 + if (map_data && map_data->null_mapped) 77 + bio_set_flag(bio, BIO_NULL_MAPPED); 78 + 79 + iov_iter_advance(iter, bio->bi_iter.bi_size); 80 + if (map_data) 81 + map_data->offset += bio->bi_iter.bi_size; 82 + 83 + orig_bio = bio; 84 + blk_queue_bounce(q, &bio); 85 + 86 + /* 87 + * We link the bounce buffer in and could have to traverse it 88 + * later so we have to get a ref to prevent it from being freed 89 + */ 90 + bio_get(bio); 91 + 92 + ret = blk_rq_append_bio(q, rq, bio); 93 + if (ret) { 94 + bio_endio(bio); 95 + __blk_rq_unmap_user(orig_bio); 96 + bio_put(bio); 97 + return ret; 98 + } 99 + 100 + return 0; 101 + } 102 + 60 103 /** 61 104 * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage 62 105 * @q: request queue where request should be inserted ··· 125 82 struct rq_map_data *map_data, 126 83 const struct iov_iter *iter, gfp_t gfp_mask) 127 84 { 128 - struct bio *bio; 129 - int unaligned = 0; 130 - struct iov_iter i; 131 85 struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; 86 + bool copy = (q->dma_pad_mask & iter->count) || map_data; 87 + struct bio *bio = NULL; 88 + struct iov_iter i; 89 + int ret; 132 90 133 91 if (!iter || !iter->count) 134 92 return -EINVAL; ··· 145 101 */ 146 102 if ((uaddr & queue_dma_alignment(q)) || 147 103 iovec_gap_to_prv(q, &prv, &iov)) 148 - unaligned = 1; 104 + copy = true; 149 105 150 106 prv.iov_base = iov.iov_base; 151 107 prv.iov_len = iov.iov_len; 152 108 } 153 109 154 - if (unaligned || (q->dma_pad_mask & iter->count) || 
map_data) 155 - bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); 156 - else 157 - bio = bio_map_user_iov(q, iter, gfp_mask); 158 - 159 - if (IS_ERR(bio)) 160 - return PTR_ERR(bio); 161 - 162 - if (map_data && map_data->null_mapped) 163 - bio_set_flag(bio, BIO_NULL_MAPPED); 164 - 165 - if (bio->bi_iter.bi_size != iter->count) { 166 - /* 167 - * Grab an extra reference to this bio, as bio_unmap_user() 168 - * expects to be able to drop it twice as it happens on the 169 - * normal IO completion path 170 - */ 171 - bio_get(bio); 172 - bio_endio(bio); 173 - __blk_rq_unmap_user(bio); 174 - return -EINVAL; 175 - } 110 + i = *iter; 111 + do { 112 + ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); 113 + if (ret) 114 + goto unmap_rq; 115 + if (!bio) 116 + bio = rq->bio; 117 + } while (iov_iter_count(&i)); 176 118 177 119 if (!bio_flagged(bio, BIO_USER_MAPPED)) 178 120 rq->cmd_flags |= REQ_COPY_USER; 179 - 180 - blk_queue_bounce(q, &bio); 181 - bio_get(bio); 182 - blk_rq_bio_prep(q, rq, bio); 183 121 return 0; 122 + 123 + unmap_rq: 124 + __blk_rq_unmap_user(bio); 125 + rq->bio = NULL; 126 + return -EINVAL; 184 127 } 185 128 EXPORT_SYMBOL(blk_rq_map_user_iov); 186 129
+2 -6
block/blk-merge.c
··· 304 304 struct bio *nxt) 305 305 { 306 306 struct bio_vec end_bv = { NULL }, nxt_bv; 307 - struct bvec_iter iter; 308 307 309 308 if (!blk_queue_cluster(q)) 310 309 return 0; ··· 315 316 if (!bio_has_data(bio)) 316 317 return 1; 317 318 318 - bio_for_each_segment(end_bv, bio, iter) 319 - if (end_bv.bv_len == iter.bi_size) 320 - break; 321 - 322 - nxt_bv = bio_iovec(nxt); 319 + bio_get_last_bvec(bio, &end_bv); 320 + bio_get_first_bvec(nxt, &nxt_bv); 323 321 324 322 if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) 325 323 return 0;
+78 -33
drivers/nvme/host/core.c
··· 55 55 ns->disk->private_data = NULL; 56 56 spin_unlock(&dev_list_lock); 57 57 58 - nvme_put_ctrl(ns->ctrl); 59 58 put_disk(ns->disk); 59 + ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); 60 + nvme_put_ctrl(ns->ctrl); 60 61 kfree(ns); 61 62 } 62 63 ··· 184 183 goto out_unmap; 185 184 } 186 185 187 - if (meta_buffer) { 186 + if (meta_buffer && meta_len) { 188 187 struct bio_integrity_payload *bip; 189 188 190 189 meta = kmalloc(meta_len, GFP_KERNEL); ··· 374 373 375 374 if (copy_from_user(&io, uio, sizeof(io))) 376 375 return -EFAULT; 376 + if (io.flags) 377 + return -EINVAL; 377 378 378 379 switch (io.opcode) { 379 380 case nvme_cmd_write: ··· 427 424 return -EACCES; 428 425 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 429 426 return -EFAULT; 427 + if (cmd.flags) 428 + return -EINVAL; 430 429 431 430 memset(&c, 0, sizeof(c)); 432 431 c.common.opcode = cmd.opcode; ··· 561 556 u16 old_ms; 562 557 unsigned short bs; 563 558 559 + if (test_bit(NVME_NS_DEAD, &ns->flags)) { 560 + set_capacity(disk, 0); 561 + return -ENODEV; 562 + } 564 563 if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { 565 564 dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n", 566 565 __func__, ns->ctrl->instance, ns->ns_id); ··· 840 831 return ret; 841 832 } 842 833 834 + static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, 835 + struct request_queue *q) 836 + { 837 + if (ctrl->max_hw_sectors) { 838 + u32 max_segments = 839 + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; 840 + 841 + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); 842 + blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); 843 + } 844 + if (ctrl->stripe_size) 845 + blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9); 846 + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) 847 + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); 848 + blk_queue_virt_boundary(q, ctrl->page_size - 1); 849 + } 850 + 843 851 /* 844 852 * Initialize the cached copies of the Identify data and various controller 845 853 * register in our 
nvme_ctrl structure. This should be called as soon as ··· 913 887 ctrl->max_hw_sectors = max_hw_sectors; 914 888 } 915 889 } 890 + 891 + nvme_set_queue_limits(ctrl, ctrl->admin_q); 916 892 917 893 kfree(id); 918 894 return 0; ··· 1146 1118 if (!ns) 1147 1119 return; 1148 1120 1121 + ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL); 1122 + if (ns->instance < 0) 1123 + goto out_free_ns; 1124 + 1149 1125 ns->queue = blk_mq_init_queue(ctrl->tagset); 1150 1126 if (IS_ERR(ns->queue)) 1151 - goto out_free_ns; 1127 + goto out_release_instance; 1152 1128 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); 1153 1129 ns->queue->queuedata = ns; 1154 1130 ns->ctrl = ctrl; ··· 1166 1134 ns->disk = disk; 1167 1135 ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ 1168 1136 1137 + 1169 1138 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); 1170 - if (ctrl->max_hw_sectors) { 1171 - blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors); 1172 - blk_queue_max_segments(ns->queue, 1173 - (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1); 1174 - } 1175 - if (ctrl->stripe_size) 1176 - blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9); 1177 - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) 1178 - blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); 1179 - blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1); 1139 + nvme_set_queue_limits(ctrl, ns->queue); 1180 1140 1181 1141 disk->major = nvme_major; 1182 1142 disk->first_minor = 0; ··· 1177 1153 disk->queue = ns->queue; 1178 1154 disk->driverfs_dev = ctrl->device; 1179 1155 disk->flags = GENHD_FL_EXT_DEVT; 1180 - sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid); 1156 + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); 1181 1157 1182 1158 if (nvme_revalidate_disk(ns->disk)) 1183 1159 goto out_free_disk; ··· 1197 1173 kfree(disk); 1198 1174 out_free_queue: 1199 1175 blk_cleanup_queue(ns->queue); 1176 + out_release_instance: 1177 + 
ida_simple_remove(&ctrl->ns_ida, ns->instance); 1200 1178 out_free_ns: 1201 1179 kfree(ns); 1202 1180 } 1203 1181 1204 1182 static void nvme_ns_remove(struct nvme_ns *ns) 1205 1183 { 1206 - bool kill = nvme_io_incapable(ns->ctrl) && 1207 - !blk_queue_dying(ns->queue); 1184 + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) 1185 + return; 1208 1186 1209 - lockdep_assert_held(&ns->ctrl->namespaces_mutex); 1210 - 1211 - if (kill) { 1212 - blk_set_queue_dying(ns->queue); 1213 - 1214 - /* 1215 - * The controller was shutdown first if we got here through 1216 - * device removal. The shutdown may requeue outstanding 1217 - * requests. These need to be aborted immediately so 1218 - * del_gendisk doesn't block indefinitely for their completion. 1219 - */ 1220 - blk_mq_abort_requeue_list(ns->queue); 1221 - } 1222 1187 if (ns->disk->flags & GENHD_FL_UP) { 1223 1188 if (blk_get_integrity(ns->disk)) 1224 1189 blk_integrity_unregister(ns->disk); 1225 1190 sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, 1226 1191 &nvme_ns_attr_group); 1227 1192 del_gendisk(ns->disk); 1228 - } 1229 - if (kill || !blk_queue_dying(ns->queue)) { 1230 1193 blk_mq_abort_requeue_list(ns->queue); 1231 1194 blk_cleanup_queue(ns->queue); 1232 1195 } 1196 + mutex_lock(&ns->ctrl->namespaces_mutex); 1233 1197 list_del_init(&ns->list); 1198 + mutex_unlock(&ns->ctrl->namespaces_mutex); 1234 1199 nvme_put_ns(ns); 1235 1200 } 1236 1201 ··· 1313 1300 { 1314 1301 struct nvme_ns *ns, *next; 1315 1302 1316 - mutex_lock(&ctrl->namespaces_mutex); 1317 1303 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) 1318 1304 nvme_ns_remove(ns); 1319 - mutex_unlock(&ctrl->namespaces_mutex); 1320 1305 } 1321 1306 1322 1307 static DEFINE_IDA(nvme_instance_ida); ··· 1361 1350 1362 1351 put_device(ctrl->device); 1363 1352 nvme_release_instance(ctrl); 1353 + ida_destroy(&ctrl->ns_ida); 1364 1354 1365 1355 ctrl->ops->free_ctrl(ctrl); 1366 1356 } ··· 1402 1390 } 1403 1391 get_device(ctrl->device); 1404 1392 
dev_set_drvdata(ctrl->device, ctrl); 1393 + ida_init(&ctrl->ns_ida); 1405 1394 1406 1395 spin_lock(&dev_list_lock); 1407 1396 list_add_tail(&ctrl->node, &nvme_ctrl_list); ··· 1413 1400 nvme_release_instance(ctrl); 1414 1401 out: 1415 1402 return ret; 1403 + } 1404 + 1405 + /** 1406 + * nvme_kill_queues(): Ends all namespace queues 1407 + * @ctrl: the dead controller that needs to end 1408 + * 1409 + * Call this function when the driver determines it is unable to get the 1410 + * controller in a state capable of servicing IO. 1411 + */ 1412 + void nvme_kill_queues(struct nvme_ctrl *ctrl) 1413 + { 1414 + struct nvme_ns *ns; 1415 + 1416 + mutex_lock(&ctrl->namespaces_mutex); 1417 + list_for_each_entry(ns, &ctrl->namespaces, list) { 1418 + if (!kref_get_unless_zero(&ns->kref)) 1419 + continue; 1420 + 1421 + /* 1422 + * Revalidating a dead namespace sets capacity to 0. This will 1423 + * end buffered writers dirtying pages that can't be synced. 1424 + */ 1425 + if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags)) 1426 + revalidate_disk(ns->disk); 1427 + 1428 + blk_set_queue_dying(ns->queue); 1429 + blk_mq_abort_requeue_list(ns->queue); 1430 + blk_mq_start_stopped_hw_queues(ns->queue, true); 1431 + 1432 + nvme_put_ns(ns); 1433 + } 1434 + mutex_unlock(&ctrl->namespaces_mutex); 1416 1435 } 1417 1436 1418 1437 void nvme_stop_queues(struct nvme_ctrl *ctrl)
+8
drivers/nvme/host/nvme.h
··· 72 72 struct mutex namespaces_mutex; 73 73 struct device *device; /* char device */ 74 74 struct list_head node; 75 + struct ida ns_ida; 75 76 76 77 char name[12]; 77 78 char serial[20]; ··· 103 102 struct request_queue *queue; 104 103 struct gendisk *disk; 105 104 struct kref kref; 105 + int instance; 106 106 107 107 u8 eui[8]; 108 108 u8 uuid[16]; ··· 114 112 bool ext; 115 113 u8 pi_type; 116 114 int type; 115 + unsigned long flags; 116 + 117 + #define NVME_NS_REMOVING 0 118 + #define NVME_NS_DEAD 1 119 + 117 120 u64 mode_select_num_blocks; 118 121 u32 mode_select_block_len; 119 122 }; ··· 247 240 248 241 void nvme_stop_queues(struct nvme_ctrl *ctrl); 249 242 void nvme_start_queues(struct nvme_ctrl *ctrl); 243 + void nvme_kill_queues(struct nvme_ctrl *ctrl); 250 244 251 245 struct request *nvme_alloc_request(struct request_queue *q, 252 246 struct nvme_command *cmd, unsigned int flags);
+90 -59
drivers/nvme/host/pci.c
··· 86 86 87 87 static int nvme_reset(struct nvme_dev *dev); 88 88 static void nvme_process_cq(struct nvme_queue *nvmeq); 89 - static void nvme_remove_dead_ctrl(struct nvme_dev *dev); 90 89 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); 91 90 92 91 /* ··· 119 120 unsigned long flags; 120 121 121 122 #define NVME_CTRL_RESETTING 0 123 + #define NVME_CTRL_REMOVING 1 122 124 123 125 struct nvme_ctrl ctrl; 124 126 struct completion ioq_wait; ··· 286 286 return 0; 287 287 } 288 288 289 + static void nvme_queue_scan(struct nvme_dev *dev) 290 + { 291 + /* 292 + * Do not queue new scan work when a controller is reset during 293 + * removal. 294 + */ 295 + if (test_bit(NVME_CTRL_REMOVING, &dev->flags)) 296 + return; 297 + queue_work(nvme_workq, &dev->scan_work); 298 + } 299 + 289 300 static void nvme_complete_async_event(struct nvme_dev *dev, 290 301 struct nvme_completion *cqe) 291 302 { ··· 311 300 switch (result & 0xff07) { 312 301 case NVME_AER_NOTICE_NS_CHANGED: 313 302 dev_info(dev->dev, "rescanning\n"); 314 - queue_work(nvme_workq, &dev->scan_work); 303 + nvme_queue_scan(dev); 315 304 default: 316 305 dev_warn(dev->dev, "async event result %08x\n", result); 317 306 } ··· 690 679 691 680 spin_lock_irq(&nvmeq->q_lock); 692 681 if (unlikely(nvmeq->cq_vector < 0)) { 693 - ret = BLK_MQ_RQ_QUEUE_BUSY; 682 + if (ns && !test_bit(NVME_NS_DEAD, &ns->flags)) 683 + ret = BLK_MQ_RQ_QUEUE_BUSY; 684 + else 685 + ret = BLK_MQ_RQ_QUEUE_ERROR; 694 686 spin_unlock_irq(&nvmeq->q_lock); 695 687 goto out; 696 688 } ··· 1264 1250 static void nvme_dev_remove_admin(struct nvme_dev *dev) 1265 1251 { 1266 1252 if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { 1253 + /* 1254 + * If the controller was reset during removal, it's possible 1255 + * user requests may be waiting on a stopped queue. Start the 1256 + * queue to flush these to completion. 
1257 + */ 1258 + blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); 1267 1259 blk_cleanup_queue(dev->ctrl.admin_q); 1268 1260 blk_mq_free_tag_set(&dev->admin_tagset); 1269 1261 } ··· 1710 1690 return 0; 1711 1691 dev->ctrl.tagset = &dev->tagset; 1712 1692 } 1713 - queue_work(nvme_workq, &dev->scan_work); 1693 + nvme_queue_scan(dev); 1714 1694 return 0; 1715 1695 } 1716 1696 1717 - static int nvme_dev_map(struct nvme_dev *dev) 1697 + static int nvme_pci_enable(struct nvme_dev *dev) 1718 1698 { 1719 1699 u64 cap; 1720 - int bars, result = -ENOMEM; 1700 + int result = -ENOMEM; 1721 1701 struct pci_dev *pdev = to_pci_dev(dev->dev); 1722 1702 1723 1703 if (pci_enable_device_mem(pdev)) ··· 1725 1705 1726 1706 dev->entry[0].vector = pdev->irq; 1727 1707 pci_set_master(pdev); 1728 - bars = pci_select_bars(pdev, IORESOURCE_MEM); 1729 - if (!bars) 1730 - goto disable_pci; 1731 - 1732 - if (pci_request_selected_regions(pdev, bars, "nvme")) 1733 - goto disable_pci; 1734 1708 1735 1709 if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && 1736 1710 dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) 1737 1711 goto disable; 1738 1712 1739 - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); 1740 - if (!dev->bar) 1741 - goto disable; 1742 - 1743 1713 if (readl(dev->bar + NVME_REG_CSTS) == -1) { 1744 1714 result = -ENODEV; 1745 - goto unmap; 1715 + goto disable; 1746 1716 } 1747 1717 1748 1718 /* ··· 1742 1732 if (!pdev->irq) { 1743 1733 result = pci_enable_msix(pdev, dev->entry, 1); 1744 1734 if (result < 0) 1745 - goto unmap; 1735 + goto disable; 1746 1736 } 1747 1737 1748 1738 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); ··· 1769 1759 pci_save_state(pdev); 1770 1760 return 0; 1771 1761 1772 - unmap: 1773 - iounmap(dev->bar); 1774 - dev->bar = NULL; 1775 1762 disable: 1776 - pci_release_regions(pdev); 1777 - disable_pci: 1778 1763 pci_disable_device(pdev); 1779 1764 return result; 1780 1765 } 1781 1766 1782 1767 static void nvme_dev_unmap(struct nvme_dev *dev) 
1768 + { 1769 + if (dev->bar) 1770 + iounmap(dev->bar); 1771 + pci_release_regions(to_pci_dev(dev->dev)); 1772 + } 1773 + 1774 + static void nvme_pci_disable(struct nvme_dev *dev) 1783 1775 { 1784 1776 struct pci_dev *pdev = to_pci_dev(dev->dev); 1785 1777 ··· 1789 1777 pci_disable_msi(pdev); 1790 1778 else if (pdev->msix_enabled) 1791 1779 pci_disable_msix(pdev); 1792 - 1793 - if (dev->bar) { 1794 - iounmap(dev->bar); 1795 - dev->bar = NULL; 1796 - pci_release_regions(pdev); 1797 - } 1798 1780 1799 1781 if (pci_is_enabled(pdev)) { 1800 1782 pci_disable_pcie_error_reporting(pdev); ··· 1848 1842 nvme_dev_list_remove(dev); 1849 1843 1850 1844 mutex_lock(&dev->shutdown_lock); 1851 - if (dev->bar) { 1845 + if (pci_is_enabled(to_pci_dev(dev->dev))) { 1852 1846 nvme_stop_queues(&dev->ctrl); 1853 1847 csts = readl(dev->bar + NVME_REG_CSTS); 1854 1848 } ··· 1861 1855 nvme_disable_io_queues(dev); 1862 1856 nvme_disable_admin_queue(dev, shutdown); 1863 1857 } 1864 - nvme_dev_unmap(dev); 1858 + nvme_pci_disable(dev); 1865 1859 1866 1860 for (i = dev->queue_count - 1; i >= 0; i--) 1867 1861 nvme_clear_queue(dev->queues[i]); ··· 1905 1899 kfree(dev); 1906 1900 } 1907 1901 1902 + static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) 1903 + { 1904 + dev_warn(dev->dev, "Removing after probe failure status: %d\n", status); 1905 + 1906 + kref_get(&dev->ctrl.kref); 1907 + nvme_dev_disable(dev, false); 1908 + if (!schedule_work(&dev->remove_work)) 1909 + nvme_put_ctrl(&dev->ctrl); 1910 + } 1911 + 1908 1912 static void nvme_reset_work(struct work_struct *work) 1909 1913 { 1910 1914 struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); 1911 - int result; 1915 + int result = -ENODEV; 1912 1916 1913 1917 if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) 1914 1918 goto out; ··· 1927 1911 * If we're called to reset a live controller first shut it down before 1928 1912 * moving on. 
1929 1913 */ 1930 - if (dev->bar) 1914 + if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) 1931 1915 nvme_dev_disable(dev, false); 1932 1916 1933 1917 set_bit(NVME_CTRL_RESETTING, &dev->flags); 1934 1918 1935 - result = nvme_dev_map(dev); 1919 + result = nvme_pci_enable(dev); 1936 1920 if (result) 1937 1921 goto out; 1938 1922 1939 1923 result = nvme_configure_admin_queue(dev); 1940 1924 if (result) 1941 - goto unmap; 1925 + goto out; 1942 1926 1943 1927 nvme_init_queue(dev->queues[0], 0); 1944 1928 result = nvme_alloc_admin_tags(dev); 1945 1929 if (result) 1946 - goto disable; 1930 + goto out; 1947 1931 1948 1932 result = nvme_init_identify(&dev->ctrl); 1949 1933 if (result) 1950 - goto free_tags; 1934 + goto out; 1951 1935 1952 1936 result = nvme_setup_io_queues(dev); 1953 1937 if (result) 1954 - goto free_tags; 1938 + goto out; 1955 1939 1956 1940 dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; 1957 1941 1958 1942 result = nvme_dev_list_add(dev); 1959 1943 if (result) 1960 - goto remove; 1944 + goto out; 1961 1945 1962 1946 /* 1963 1947 * Keep the controller around but remove all namespaces if we don't have ··· 1974 1958 clear_bit(NVME_CTRL_RESETTING, &dev->flags); 1975 1959 return; 1976 1960 1977 - remove: 1978 - nvme_dev_list_remove(dev); 1979 - free_tags: 1980 - nvme_dev_remove_admin(dev); 1981 - blk_put_queue(dev->ctrl.admin_q); 1982 - dev->ctrl.admin_q = NULL; 1983 - dev->queues[0]->tags = NULL; 1984 - disable: 1985 - nvme_disable_admin_queue(dev, false); 1986 - unmap: 1987 - nvme_dev_unmap(dev); 1988 1961 out: 1989 - nvme_remove_dead_ctrl(dev); 1962 + nvme_remove_dead_ctrl(dev, result); 1990 1963 } 1991 1964 1992 1965 static void nvme_remove_dead_ctrl_work(struct work_struct *work) ··· 1983 1978 struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); 1984 1979 struct pci_dev *pdev = to_pci_dev(dev->dev); 1985 1980 1981 + nvme_kill_queues(&dev->ctrl); 1986 1982 if (pci_get_drvdata(pdev)) 1987 1983 pci_stop_and_remove_bus_device_locked(pdev); 1988 
1984 nvme_put_ctrl(&dev->ctrl); 1989 - } 1990 - 1991 - static void nvme_remove_dead_ctrl(struct nvme_dev *dev) 1992 - { 1993 - dev_warn(dev->dev, "Removing after probe failure\n"); 1994 - kref_get(&dev->ctrl.kref); 1995 - if (!schedule_work(&dev->remove_work)) 1996 - nvme_put_ctrl(&dev->ctrl); 1997 1985 } 1998 1986 1999 1987 static int nvme_reset(struct nvme_dev *dev) ··· 2040 2042 .free_ctrl = nvme_pci_free_ctrl, 2041 2043 }; 2042 2044 2045 + static int nvme_dev_map(struct nvme_dev *dev) 2046 + { 2047 + int bars; 2048 + struct pci_dev *pdev = to_pci_dev(dev->dev); 2049 + 2050 + bars = pci_select_bars(pdev, IORESOURCE_MEM); 2051 + if (!bars) 2052 + return -ENODEV; 2053 + if (pci_request_selected_regions(pdev, bars, "nvme")) 2054 + return -ENODEV; 2055 + 2056 + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); 2057 + if (!dev->bar) 2058 + goto release; 2059 + 2060 + return 0; 2061 + release: 2062 + pci_release_regions(pdev); 2063 + return -ENODEV; 2064 + } 2065 + 2043 2066 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) 2044 2067 { 2045 2068 int node, result = -ENOMEM; ··· 2085 2066 dev->dev = get_device(&pdev->dev); 2086 2067 pci_set_drvdata(pdev, dev); 2087 2068 2069 + result = nvme_dev_map(dev); 2070 + if (result) 2071 + goto free; 2072 + 2088 2073 INIT_LIST_HEAD(&dev->node); 2089 2074 INIT_WORK(&dev->scan_work, nvme_dev_scan); 2090 2075 INIT_WORK(&dev->reset_work, nvme_reset_work); ··· 2112 2089 nvme_release_prp_pools(dev); 2113 2090 put_pci: 2114 2091 put_device(dev->dev); 2092 + nvme_dev_unmap(dev); 2115 2093 free: 2116 2094 kfree(dev->queues); 2117 2095 kfree(dev->entry); ··· 2136 2112 nvme_dev_disable(dev, true); 2137 2113 } 2138 2114 2115 + /* 2116 + * The driver's remove may be called on a device in a partially initialized 2117 + * state. This function must not have any dependencies on the device state in 2118 + * order to proceed. 
2119 + */ 2139 2120 static void nvme_remove(struct pci_dev *pdev) 2140 2121 { 2141 2122 struct nvme_dev *dev = pci_get_drvdata(pdev); 2142 2123 2124 + set_bit(NVME_CTRL_REMOVING, &dev->flags); 2143 2125 pci_set_drvdata(pdev, NULL); 2144 2126 flush_work(&dev->scan_work); 2145 2127 nvme_remove_namespaces(&dev->ctrl); ··· 2156 2126 nvme_free_queues(dev, 0); 2157 2127 nvme_release_cmb(dev); 2158 2128 nvme_release_prp_pools(dev); 2129 + nvme_dev_unmap(dev); 2159 2130 nvme_put_ctrl(&dev->ctrl); 2160 2131 } 2161 2132
+41 -13
fs/fs-writeback.c
··· 223 223 #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) 224 224 /* one round can affect upto 5 slots */ 225 225 226 + static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); 227 + static struct workqueue_struct *isw_wq; 228 + 226 229 void __inode_attach_wb(struct inode *inode, struct page *page) 227 230 { 228 231 struct backing_dev_info *bdi = inode_to_bdi(inode); ··· 320 317 struct inode_switch_wbs_context *isw = 321 318 container_of(work, struct inode_switch_wbs_context, work); 322 319 struct inode *inode = isw->inode; 323 - struct super_block *sb = inode->i_sb; 324 320 struct address_space *mapping = inode->i_mapping; 325 321 struct bdi_writeback *old_wb = inode->i_wb; 326 322 struct bdi_writeback *new_wb = isw->new_wb; ··· 426 424 wb_put(new_wb); 427 425 428 426 iput(inode); 429 - deactivate_super(sb); 430 427 kfree(isw); 428 + 429 + atomic_dec(&isw_nr_in_flight); 431 430 } 432 431 433 432 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) ··· 438 435 439 436 /* needs to grab bh-unsafe locks, bounce to work item */ 440 437 INIT_WORK(&isw->work, inode_switch_wbs_work_fn); 441 - schedule_work(&isw->work); 438 + queue_work(isw_wq, &isw->work); 442 439 } 443 440 444 441 /** ··· 474 471 475 472 /* while holding I_WB_SWITCH, no one else can update the association */ 476 473 spin_lock(&inode->i_lock); 477 - 478 - if (inode->i_state & (I_WB_SWITCH | I_FREEING) || 479 - inode_to_wb(inode) == isw->new_wb) 480 - goto out_unlock; 481 - 482 - if (!atomic_inc_not_zero(&inode->i_sb->s_active)) 483 - goto out_unlock; 484 - 474 + if (!(inode->i_sb->s_flags & MS_ACTIVE) || 475 + inode->i_state & (I_WB_SWITCH | I_FREEING) || 476 + inode_to_wb(inode) == isw->new_wb) { 477 + spin_unlock(&inode->i_lock); 478 + goto out_free; 479 + } 485 480 inode->i_state |= I_WB_SWITCH; 486 481 spin_unlock(&inode->i_lock); 487 482 488 483 ihold(inode); 489 484 isw->inode = inode; 485 + 486 + atomic_inc(&isw_nr_in_flight); 490 487 491 488 /* 492 489 * In addition to 
synchronizing among switchers, I_WB_SWITCH tells ··· 497 494 call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); 498 495 return; 499 496 500 - out_unlock: 501 - spin_unlock(&inode->i_lock); 502 497 out_free: 503 498 if (isw->new_wb) 504 499 wb_put(isw->new_wb); ··· 847 846 if (last_wb) 848 847 wb_put(last_wb); 849 848 } 849 + 850 + /** 851 + * cgroup_writeback_umount - flush inode wb switches for umount 852 + * 853 + * This function is called when a super_block is about to be destroyed and 854 + * flushes in-flight inode wb switches. An inode wb switch goes through 855 + * RCU and then workqueue, so the two need to be flushed in order to ensure 856 + * that all previously scheduled switches are finished. As wb switches are 857 + * rare occurrences and synchronize_rcu() can take a while, perform 858 + * flushing iff wb switches are in flight. 859 + */ 860 + void cgroup_writeback_umount(void) 861 + { 862 + if (atomic_read(&isw_nr_in_flight)) { 863 + synchronize_rcu(); 864 + flush_workqueue(isw_wq); 865 + } 866 + } 867 + 868 + static int __init cgroup_writeback_init(void) 869 + { 870 + isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); 871 + if (!isw_wq) 872 + return -ENOMEM; 873 + return 0; 874 + } 875 + fs_initcall(cgroup_writeback_init); 850 876 851 877 #else /* CONFIG_CGROUP_WRITEBACK */ 852 878
+1
fs/super.c
··· 415 415 sb->s_flags &= ~MS_ACTIVE; 416 416 417 417 fsnotify_unmount_inodes(sb); 418 + cgroup_writeback_umount(); 418 419 419 420 evict_inodes(sb); 420 421
+37
include/linux/bio.h
··· 310 310 bio->bi_flags &= ~(1U << bit); 311 311 } 312 312 313 + static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) 314 + { 315 + *bv = bio_iovec(bio); 316 + } 317 + 318 + static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) 319 + { 320 + struct bvec_iter iter = bio->bi_iter; 321 + int idx; 322 + 323 + if (!bio_flagged(bio, BIO_CLONED)) { 324 + *bv = bio->bi_io_vec[bio->bi_vcnt - 1]; 325 + return; 326 + } 327 + 328 + if (unlikely(!bio_multiple_segments(bio))) { 329 + *bv = bio_iovec(bio); 330 + return; 331 + } 332 + 333 + bio_advance_iter(bio, &iter, iter.bi_size); 334 + 335 + if (!iter.bi_bvec_done) 336 + idx = iter.bi_idx - 1; 337 + else /* in the middle of bvec */ 338 + idx = iter.bi_idx; 339 + 340 + *bv = bio->bi_io_vec[idx]; 341 + 342 + /* 343 + * iter.bi_bvec_done records actual length of the last bvec 344 + * if this bio ends in the middle of one io vector 345 + */ 346 + if (iter.bi_bvec_done) 347 + bv->bv_len = iter.bi_bvec_done; 348 + } 349 + 313 350 enum bip_flags { 314 351 BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ 315 352 BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */
+18 -7
include/linux/blkdev.h
··· 895 895 { 896 896 struct request_queue *q = rq->q; 897 897 898 - if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC)) 898 + if (unlikely(rq->cmd_type != REQ_TYPE_FS)) 899 899 return q->limits.max_hw_sectors; 900 900 901 901 if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD)) ··· 1372 1372 page_cache_release(p.v); 1373 1373 } 1374 1374 1375 + static inline bool __bvec_gap_to_prev(struct request_queue *q, 1376 + struct bio_vec *bprv, unsigned int offset) 1377 + { 1378 + return offset || 1379 + ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); 1380 + } 1381 + 1375 1382 /* 1376 1383 * Check if adding a bio_vec after bprv with offset would create a gap in 1377 1384 * the SG list. Most drivers don't care about this, but some do. ··· 1388 1381 { 1389 1382 if (!queue_virt_boundary(q)) 1390 1383 return false; 1391 - return offset || 1392 - ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); 1384 + return __bvec_gap_to_prev(q, bprv, offset); 1393 1385 } 1394 1386 1395 1387 static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, 1396 1388 struct bio *next) 1397 1389 { 1398 - if (!bio_has_data(prev)) 1399 - return false; 1390 + if (bio_has_data(prev) && queue_virt_boundary(q)) { 1391 + struct bio_vec pb, nb; 1400 1392 1401 - return bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1], 1402 - next->bi_io_vec[0].bv_offset); 1393 + bio_get_last_bvec(prev, &pb); 1394 + bio_get_first_bvec(next, &nb); 1395 + 1396 + return __bvec_gap_to_prev(q, &pb, nb.bv_offset); 1397 + } 1398 + 1399 + return false; 1403 1400 } 1404 1401 1405 1402 static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
+5
include/linux/writeback.h
··· 198 198 void wbc_detach_inode(struct writeback_control *wbc); 199 199 void wbc_account_io(struct writeback_control *wbc, struct page *page, 200 200 size_t bytes); 201 + void cgroup_writeback_umount(void); 201 202 202 203 /** 203 204 * inode_attach_wb - associate an inode with its wb ··· 299 298 300 299 static inline void wbc_account_io(struct writeback_control *wbc, 301 300 struct page *page, size_t bytes) 301 + { 302 + } 303 + 304 + static inline void cgroup_writeback_umount(void) 302 305 { 303 306 } 304 307