Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'nvme-5.4' of git://git.infradead.org/nvme into for-linus

Pull NVMe changes from Sagi:

"This set consists of various fixes and cleanups:
- controller removal race fix from Balbir
- quirk additions from Gabriel and Jian-Hong
- nvme-pci power state save fix from Mario
- Add 64bit user commands (for 64bit registers) from Marta
- nvme-rdma/nvme-tcp fixes from Max, Mark and me
- Minor cleanups and nits from James, Dan and John"

* 'nvme-5.4' of git://git.infradead.org/nvme:
nvme-rdma: fix possible use-after-free in connect timeout
nvme: Move ctrl sqsize to generic space
nvme: Add ctrl attributes for queue_count and sqsize
nvme: allow 64-bit results in passthru commands
nvme: Add quirk for Kingston NVME SSD running FW E8FK11.T
nvmet-tcp: remove superfluous check on request sgl
Added QUIRKs for ADATA XPG SX8200 Pro 512GB
nvme-rdma: Fix max_hw_sectors calculation
nvme: fix an error code in nvme_init_subsystem()
nvme-pci: Save PCI state before putting drive into deepest state
nvme-tcp: fix wrong stop condition in io_work
nvme-pci: Fix a race in controller removal
nvmet: change ppl to lpp

+177 -51
+113 -19
drivers/nvme/host/core.c
··· 102 102 */ 103 103 if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags)) 104 104 return; 105 - revalidate_disk(ns->disk); 106 105 blk_set_queue_dying(ns->queue); 107 106 /* Forcibly unquiesce queues to avoid blocking dispatch */ 108 107 blk_mq_unquiesce_queue(ns->queue); 108 + /* 109 + * Revalidate after unblocking dispatchers that may be holding bd_mutex 110 + */ 111 + revalidate_disk(ns->disk); 109 112 } 110 113 111 114 static void nvme_queue_scan(struct nvme_ctrl *ctrl) ··· 850 847 static int nvme_submit_user_cmd(struct request_queue *q, 851 848 struct nvme_command *cmd, void __user *ubuffer, 852 849 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 853 - u32 meta_seed, u32 *result, unsigned timeout) 850 + u32 meta_seed, u64 *result, unsigned timeout) 854 851 { 855 852 bool write = nvme_is_write(cmd); 856 853 struct nvme_ns *ns = q->queuedata; ··· 891 888 else 892 889 ret = nvme_req(req)->status; 893 890 if (result) 894 - *result = le32_to_cpu(nvme_req(req)->result.u32); 891 + *result = le64_to_cpu(nvme_req(req)->result.u64); 895 892 if (meta && !ret && !write) { 896 893 if (copy_to_user(meta_buffer, meta, meta_len)) 897 894 ret = -EFAULT; ··· 1338 1335 struct nvme_command c; 1339 1336 unsigned timeout = 0; 1340 1337 u32 effects; 1338 + u64 result; 1339 + int status; 1340 + 1341 + if (!capable(CAP_SYS_ADMIN)) 1342 + return -EACCES; 1343 + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 1344 + return -EFAULT; 1345 + if (cmd.flags) 1346 + return -EINVAL; 1347 + 1348 + memset(&c, 0, sizeof(c)); 1349 + c.common.opcode = cmd.opcode; 1350 + c.common.flags = cmd.flags; 1351 + c.common.nsid = cpu_to_le32(cmd.nsid); 1352 + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1353 + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1354 + c.common.cdw10 = cpu_to_le32(cmd.cdw10); 1355 + c.common.cdw11 = cpu_to_le32(cmd.cdw11); 1356 + c.common.cdw12 = cpu_to_le32(cmd.cdw12); 1357 + c.common.cdw13 = cpu_to_le32(cmd.cdw13); 1358 + c.common.cdw14 = cpu_to_le32(cmd.cdw14); 
1359 + c.common.cdw15 = cpu_to_le32(cmd.cdw15); 1360 + 1361 + if (cmd.timeout_ms) 1362 + timeout = msecs_to_jiffies(cmd.timeout_ms); 1363 + 1364 + effects = nvme_passthru_start(ctrl, ns, cmd.opcode); 1365 + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 1366 + (void __user *)(uintptr_t)cmd.addr, cmd.data_len, 1367 + (void __user *)(uintptr_t)cmd.metadata, 1368 + cmd.metadata_len, 0, &result, timeout); 1369 + nvme_passthru_end(ctrl, effects); 1370 + 1371 + if (status >= 0) { 1372 + if (put_user(result, &ucmd->result)) 1373 + return -EFAULT; 1374 + } 1375 + 1376 + return status; 1377 + } 1378 + 1379 + static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 1380 + struct nvme_passthru_cmd64 __user *ucmd) 1381 + { 1382 + struct nvme_passthru_cmd64 cmd; 1383 + struct nvme_command c; 1384 + unsigned timeout = 0; 1385 + u32 effects; 1341 1386 int status; 1342 1387 1343 1388 if (!capable(CAP_SYS_ADMIN)) ··· 1456 1405 srcu_read_unlock(&head->srcu, idx); 1457 1406 } 1458 1407 1408 + static bool is_ctrl_ioctl(unsigned int cmd) 1409 + { 1410 + if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) 1411 + return true; 1412 + if (is_sed_ioctl(cmd)) 1413 + return true; 1414 + return false; 1415 + } 1416 + 1417 + static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, 1418 + void __user *argp, 1419 + struct nvme_ns_head *head, 1420 + int srcu_idx) 1421 + { 1422 + struct nvme_ctrl *ctrl = ns->ctrl; 1423 + int ret; 1424 + 1425 + nvme_get_ctrl(ns->ctrl); 1426 + nvme_put_ns_from_disk(head, srcu_idx); 1427 + 1428 + switch (cmd) { 1429 + case NVME_IOCTL_ADMIN_CMD: 1430 + ret = nvme_user_cmd(ctrl, NULL, argp); 1431 + break; 1432 + case NVME_IOCTL_ADMIN64_CMD: 1433 + ret = nvme_user_cmd64(ctrl, NULL, argp); 1434 + break; 1435 + default: 1436 + ret = sed_ioctl(ctrl->opal_dev, cmd, argp); 1437 + break; 1438 + } 1439 + nvme_put_ctrl(ctrl); 1440 + return ret; 1441 + } 1442 + 1459 1443 static int nvme_ioctl(struct block_device *bdev, 
 fmode_t mode, 1460 1444 unsigned int cmd, unsigned long arg) 1461 1445 { ··· 1508 1422 * separately and drop the ns SRCU reference early. This avoids a 1509 1423 * deadlock when deleting namespaces using the passthrough interface. 1510 1424 */ 1511 - if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) { 1512 - struct nvme_ctrl *ctrl = ns->ctrl; 1513 - 1514 - nvme_get_ctrl(ns->ctrl); 1515 - nvme_put_ns_from_disk(head, srcu_idx); 1516 - 1517 - if (cmd == NVME_IOCTL_ADMIN_CMD) 1518 - ret = nvme_user_cmd(ctrl, NULL, argp); 1519 - else 1520 - ret = sed_ioctl(ctrl->opal_dev, cmd, argp); 1521 - 1522 - nvme_put_ctrl(ctrl); 1523 - return ret; 1524 - } 1425 + if (is_ctrl_ioctl(cmd)) 1426 + return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); 1526 1428 switch (cmd) { 1527 1429 case NVME_IOCTL_ID: ··· 1521 1447 break; 1522 1448 case NVME_IOCTL_SUBMIT_IO: 1523 1449 ret = nvme_submit_io(ns, argp); 1450 + break; 1451 + case NVME_IOCTL_IO64_CMD: 1452 + ret = nvme_user_cmd64(ns->ctrl, ns, argp); 1524 1453 break; 1525 1454 default: 1526 1455 if (ns->ndev) ··· 2366 2289 .vid = 0x14a4, 2367 2290 .fr = "22301111", 2368 2291 .quirks = NVME_QUIRK_SIMPLE_SUSPEND, 2292 + }, 2293 + { 2294 + /* 2295 + * This Kingston E8FK11.T firmware version has no interrupt 2296 + * after resume with actions related to suspend to idle 2297 + * https://bugzilla.kernel.org/show_bug.cgi?id=204887 2298 + */ 2299 + .vid = 0x2646, 2300 + .fr = "E8FK11.T", 2301 + .quirks = NVME_QUIRK_SIMPLE_SUSPEND, 2369 2302 } 2370 2303 }; 2371 2304 ··· 2627 2540 list_add_tail(&subsys->entry, &nvme_subsystems); 2628 2541 } 2629 2542 2630 - if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, 2631 - dev_name(ctrl->device))) { 2543 + ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, 2544 + dev_name(ctrl->device)); 2545 + if (ret) { 2632 2546 dev_err(ctrl->device, 2633 2547 "failed to create sysfs link from subsystem.\n"); 2634 2548 goto out_put_subsystem; ··· 2926 2838 switch (cmd) { 
2927 2839 case NVME_IOCTL_ADMIN_CMD: 2928 2840 return nvme_user_cmd(ctrl, NULL, argp); 2841 + case NVME_IOCTL_ADMIN64_CMD: 2842 + return nvme_user_cmd64(ctrl, NULL, argp); 2929 2843 case NVME_IOCTL_IO_CMD: 2930 2844 return nvme_dev_user_cmd(ctrl, argp); 2931 2845 case NVME_IOCTL_RESET: ··· 3135 3045 3136 3046 nvme_show_int_function(cntlid); 3137 3047 nvme_show_int_function(numa_node); 3048 + nvme_show_int_function(queue_count); 3049 + nvme_show_int_function(sqsize); 3138 3050 3139 3051 static ssize_t nvme_sysfs_delete(struct device *dev, 3140 3052 struct device_attribute *attr, const char *buf, ··· 3217 3125 &dev_attr_address.attr, 3218 3126 &dev_attr_state.attr, 3219 3127 &dev_attr_numa_node.attr, 3128 + &dev_attr_queue_count.attr, 3129 + &dev_attr_sqsize.attr, 3220 3130 NULL 3221 3131 }; 3222 3132
+1 -1
drivers/nvme/host/nvme.h
··· 221 221 u16 oacs; 222 222 u16 nssa; 223 223 u16 nr_streams; 224 + u16 sqsize; 224 225 u32 max_namespaces; 225 226 atomic_t abort_limit; 226 227 u8 vwc; ··· 270 269 u16 hmmaxd; 271 270 272 271 /* Fabrics only */ 273 - u16 sqsize; 274 272 u32 ioccsz; 275 273 u32 iorcsz; 276 274 u16 icdoff;
+13 -7
drivers/nvme/host/pci.c
··· 2946 2946 if (ret < 0) 2947 2947 goto unfreeze; 2948 2948 2949 + /* 2950 + * A saved state prevents pci pm from generically controlling the 2951 + * device's power. If we're using protocol specific settings, we don't 2952 + * want pci interfering. 2953 + */ 2954 + pci_save_state(pdev); 2955 + 2949 2956 ret = nvme_set_power_state(ctrl, ctrl->npss); 2950 2957 if (ret < 0) 2951 2958 goto unfreeze; 2952 2959 2953 2960 if (ret) { 2961 + /* discard the saved state */ 2962 + pci_load_saved_state(pdev, NULL); 2963 + 2954 2964 /* 2955 2965 * Clearing npss forces a controller reset on resume. The 2956 2966 * correct value will be rediscovered then. ··· 2968 2958 nvme_dev_disable(ndev, true); 2969 2959 ctrl->npss = 0; 2970 2960 ret = 0; 2971 - goto unfreeze; 2972 2961 } 2973 - /* 2974 - * A saved state prevents pci pm from generically controlling the 2975 - * device's power. If we're using protocol specific settings, we don't 2976 - * want pci interfering. 2977 - */ 2978 - pci_save_state(pdev); 2979 2962 unfreeze: 2980 2963 nvme_unfreeze(ctrl); 2981 2964 return ret; ··· 3093 3090 .driver_data = NVME_QUIRK_LIGHTNVM, }, 3094 3091 { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ 3095 3092 .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, 3093 + { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ 3094 + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | 3095 + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, 3096 3096 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 3097 3097 { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, 3098 3098 { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
+13 -6
drivers/nvme/host/rdma.c
··· 427 427 static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev) 428 428 { 429 429 return min_t(u32, NVME_RDMA_MAX_SEGMENTS, 430 - ibdev->attrs.max_fast_reg_page_list_len); 430 + ibdev->attrs.max_fast_reg_page_list_len - 1); 431 431 } 432 432 433 433 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) ··· 437 437 const int cq_factor = send_wr_factor + 1; /* + RECV */ 438 438 int comp_vector, idx = nvme_rdma_queue_idx(queue); 439 439 enum ib_poll_context poll_ctx; 440 - int ret; 440 + int ret, pages_per_mr; 441 441 442 442 queue->device = nvme_rdma_find_get_device(queue->cm_id); 443 443 if (!queue->device) { ··· 479 479 goto out_destroy_qp; 480 480 } 481 481 482 + /* 483 + * Currently we don't use SG_GAPS MR's so if the first entry is 484 + * misaligned we'll end up using two entries for a single data page, 485 + * so one additional entry is required. 486 + */ 487 + pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1; 482 488 ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs, 483 489 queue->queue_size, 484 490 IB_MR_TYPE_MEM_REG, 485 - nvme_rdma_get_max_fr_pages(ibdev), 0); 491 + pages_per_mr, 0); 486 492 if (ret) { 487 493 dev_err(queue->ctrl->ctrl.device, 488 494 "failed to initialize MR pool sized %d for QID %d\n", ··· 620 614 if (!ret) { 621 615 set_bit(NVME_RDMA_Q_LIVE, &queue->flags); 622 616 } else { 623 - __nvme_rdma_stop_queue(queue); 617 + if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) 618 + __nvme_rdma_stop_queue(queue); 624 619 dev_info(ctrl->ctrl.device, 625 620 "failed to connect queue: %d ret=%d\n", idx, ret); 626 621 } ··· 827 820 if (error) 828 821 goto out_stop_queue; 829 822 830 - ctrl->ctrl.max_hw_sectors = 831 - (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9); 823 + ctrl->ctrl.max_segments = ctrl->max_fr_pages; 824 + ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9); 832 825 833 826 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 834 827
+2 -2
drivers/nvme/host/tcp.c
··· 1042 1042 { 1043 1043 struct nvme_tcp_queue *queue = 1044 1044 container_of(w, struct nvme_tcp_queue, io_work); 1045 - unsigned long start = jiffies + msecs_to_jiffies(1); 1045 + unsigned long deadline = jiffies + msecs_to_jiffies(1); 1046 1046 1047 1047 do { 1048 1048 bool pending = false; ··· 1067 1067 if (!pending) 1068 1068 return; 1069 1069 1070 - } while (time_after(jiffies, start)); /* quota is exhausted */ 1070 + } while (!time_after(jiffies, deadline)); /* quota is exhausted */ 1071 1071 1072 1072 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); 1073 1073 }
+8 -8
drivers/nvme/target/io-cmd-bdev.c
··· 11 11 void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) 12 12 { 13 13 const struct queue_limits *ql = &bdev_get_queue(bdev)->limits; 14 - /* Number of physical blocks per logical block. */ 15 - const u32 ppl = ql->physical_block_size / ql->logical_block_size; 16 - /* Physical blocks per logical block, 0's based. */ 17 - const __le16 ppl0b = to0based(ppl); 14 + /* Number of logical blocks per physical block. */ 15 + const u32 lpp = ql->physical_block_size / ql->logical_block_size; 16 + /* Logical blocks per physical block, 0's based. */ 17 + const __le16 lpp0b = to0based(lpp); 18 18 19 19 /* 20 20 * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN, ··· 25 25 * field from the identify controller data structure should be used. 26 26 */ 27 27 id->nsfeat |= 1 << 1; 28 - id->nawun = ppl0b; 29 - id->nawupf = ppl0b; 30 - id->nacwu = ppl0b; 28 + id->nawun = lpp0b; 29 + id->nawupf = lpp0b; 30 + id->nacwu = lpp0b; 31 31 32 32 /* 33 33 * Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and ··· 36 36 */ 37 37 id->nsfeat |= 1 << 4; 38 38 /* NPWG = Namespace Preferred Write Granularity. 0's based */ 39 - id->npwg = ppl0b; 39 + id->npwg = lpp0b; 40 40 /* NPWA = Namespace Preferred Write Alignment. 0's based */ 41 41 id->npwa = id->npwg; 42 42 /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */
+4 -8
drivers/nvme/target/tcp.c
··· 348 348 349 349 return 0; 350 350 err: 351 - if (cmd->req.sg_cnt) 352 - sgl_free(cmd->req.sg); 351 + sgl_free(cmd->req.sg); 353 352 return NVME_SC_INTERNAL; 354 353 } 355 354 ··· 553 554 554 555 if (queue->nvme_sq.sqhd_disabled) { 555 556 kfree(cmd->iov); 556 - if (cmd->req.sg_cnt) 557 - sgl_free(cmd->req.sg); 557 + sgl_free(cmd->req.sg); 558 558 } 559 559 560 560 return 1; ··· 584 586 return -EAGAIN; 585 587 586 588 kfree(cmd->iov); 587 - if (cmd->req.sg_cnt) 588 - sgl_free(cmd->req.sg); 589 + sgl_free(cmd->req.sg); 589 590 cmd->queue->snd_cmd = NULL; 590 591 nvmet_tcp_put_cmd(cmd); 591 592 return 1; ··· 1307 1310 nvmet_req_uninit(&cmd->req); 1308 1311 nvmet_tcp_unmap_pdu_iovec(cmd); 1309 1312 kfree(cmd->iov); 1310 - if (cmd->req.sg_cnt) 1311 - sgl_free(cmd->req.sg); 1313 + sgl_free(cmd->req.sg); 1312 1314 } 1313 1315 1314 1316 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
+23
include/uapi/linux/nvme_ioctl.h
··· 45 45 __u32 result; 46 46 }; 47 47 48 + struct nvme_passthru_cmd64 { 49 + __u8 opcode; 50 + __u8 flags; 51 + __u16 rsvd1; 52 + __u32 nsid; 53 + __u32 cdw2; 54 + __u32 cdw3; 55 + __u64 metadata; 56 + __u64 addr; 57 + __u32 metadata_len; 58 + __u32 data_len; 59 + __u32 cdw10; 60 + __u32 cdw11; 61 + __u32 cdw12; 62 + __u32 cdw13; 63 + __u32 cdw14; 64 + __u32 cdw15; 65 + __u32 timeout_ms; 66 + __u64 result; 67 + }; 68 + 48 69 #define nvme_admin_cmd nvme_passthru_cmd 49 70 50 71 #define NVME_IOCTL_ID _IO('N', 0x40) ··· 75 54 #define NVME_IOCTL_RESET _IO('N', 0x44) 76 55 #define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) 77 56 #define NVME_IOCTL_RESCAN _IO('N', 0x46) 57 + #define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) 58 + #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) 78 59 79 60 #endif /* _UAPI_LINUX_NVME_IOCTL_H */