Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'block-5.10-2020-10-30' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:

- null_blk zone fixes (Damien, Kanchan)

- NVMe pull request from Christoph:
    - improve zone revalidation (Keith Busch)
    - gracefully handle zero length messages in nvme-rdma (zhenwei pi)
    - nvme-fc error handling fixes (James Smart)
    - nvmet tracing NULL pointer dereference fix (Chaitanya Kulkarni)

- xsysace platform fixes (Andy)

- scatterlist type cleanup (David)

- blk-cgroup memory fixes (Gabriel)

- nbd block size update fix (Ming)

- Flush completion state fix (Ming)

- bio_add_hw_page() iteration fix (Naohiro)

* tag 'block-5.10-2020-10-30' of git://git.kernel.dk/linux-block:
blk-mq: mark flush request as IDLE in flush_end_io()
lib/scatterlist: use consistent sg_copy_buffer() return type
xsysace: use platform_get_resource() and platform_get_irq_optional()
null_blk: Fix locking in zoned mode
null_blk: Fix zone reset all tracing
nbd: don't update block size after device is started
block: advance iov_iter on bio_add_hw_page failure
null_blk: synchronization fix for zoned device
nvmet: fix a NULL pointer dereference when tracing the flush command
nvme-fc: remove nvme_fc_terminate_io()
nvme-fc: eliminate terminate_io use by nvme_fc_error_recovery
nvme-fc: remove err_work work item
nvme-fc: track error_recovery while connecting
nvme-rdma: handle unexpected nvme completion data length
nvme: ignore zone validate errors on subsequent scans
blk-cgroup: Pre-allocate tree node on blkg_conf_prep
blk-cgroup: Fix memleak on error path

+283 -238
+7 -4
block/bio.c
··· 1044 1044 ssize_t size, left; 1045 1045 unsigned len, i; 1046 1046 size_t offset; 1047 + int ret = 0; 1047 1048 1048 1049 if (WARN_ON_ONCE(!max_append_sectors)) 1049 1050 return 0; ··· 1067 1066 1068 1067 len = min_t(size_t, PAGE_SIZE - offset, left); 1069 1068 if (bio_add_hw_page(q, bio, page, len, offset, 1070 - max_append_sectors, &same_page) != len) 1071 - return -EINVAL; 1069 + max_append_sectors, &same_page) != len) { 1070 + ret = -EINVAL; 1071 + break; 1072 + } 1072 1073 if (same_page) 1073 1074 put_page(page); 1074 1075 offset = 0; 1075 1076 } 1076 1077 1077 - iov_iter_advance(iter, size); 1078 - return 0; 1078 + iov_iter_advance(iter, size - left); 1079 + return ret; 1079 1080 } 1080 1081 1081 1082 /**
+13 -2
block/blk-cgroup.c
··· 657 657 goto fail; 658 658 } 659 659 660 + if (radix_tree_preload(GFP_KERNEL)) { 661 + blkg_free(new_blkg); 662 + ret = -ENOMEM; 663 + goto fail; 664 + } 665 + 660 666 rcu_read_lock(); 661 667 spin_lock_irq(&q->queue_lock); 662 668 663 669 blkg = blkg_lookup_check(pos, pol, q); 664 670 if (IS_ERR(blkg)) { 665 671 ret = PTR_ERR(blkg); 666 - goto fail_unlock; 672 + blkg_free(new_blkg); 673 + goto fail_preloaded; 667 674 } 668 675 669 676 if (blkg) { ··· 679 672 blkg = blkg_create(pos, q, new_blkg); 680 673 if (IS_ERR(blkg)) { 681 674 ret = PTR_ERR(blkg); 682 - goto fail_unlock; 675 + goto fail_preloaded; 683 676 } 684 677 } 678 + 679 + radix_tree_preload_end(); 685 680 686 681 if (pos == blkcg) 687 682 goto success; ··· 694 685 ctx->body = input; 695 686 return 0; 696 687 688 + fail_preloaded: 689 + radix_tree_preload_end(); 697 690 fail_unlock: 698 691 spin_unlock_irq(&q->queue_lock); 699 692 rcu_read_unlock();
+1
block/blk-flush.c
··· 225 225 /* release the tag's ownership to the req cloned from */ 226 226 spin_lock_irqsave(&fq->mq_flush_lock, flags); 227 227 228 + WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); 228 229 if (!refcount_dec_and_test(&flush_rq->ref)) { 229 230 fq->rq_status = error; 230 231 spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+5 -4
drivers/block/nbd.c
··· 296 296 } 297 297 } 298 298 299 - static void nbd_size_update(struct nbd_device *nbd) 299 + static void nbd_size_update(struct nbd_device *nbd, bool start) 300 300 { 301 301 struct nbd_config *config = nbd->config; 302 302 struct block_device *bdev = bdget_disk(nbd->disk, 0); ··· 313 313 if (bdev) { 314 314 if (bdev->bd_disk) { 315 315 bd_set_nr_sectors(bdev, nr_sectors); 316 - set_blocksize(bdev, config->blksize); 316 + if (start) 317 + set_blocksize(bdev, config->blksize); 317 318 } else 318 319 set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); 319 320 bdput(bdev); ··· 329 328 config->blksize = blocksize; 330 329 config->bytesize = blocksize * nr_blocks; 331 330 if (nbd->task_recv != NULL) 332 - nbd_size_update(nbd); 331 + nbd_size_update(nbd, false); 333 332 } 334 333 335 334 static void nbd_complete_rq(struct request *req) ··· 1309 1308 args->index = i; 1310 1309 queue_work(nbd->recv_workq, &args->work); 1311 1310 } 1312 - nbd_size_update(nbd); 1311 + nbd_size_update(nbd, true); 1313 1312 return error; 1314 1313 } 1315 1314
+2
drivers/block/null_blk.h
··· 47 47 unsigned int nr_zones_closed; 48 48 struct blk_zone *zones; 49 49 sector_t zone_size_sects; 50 + spinlock_t zone_dev_lock; 51 + unsigned long *zone_locks; 50 52 51 53 unsigned long size; /* device size in MB */ 52 54 unsigned long completion_nsec; /* time in ns to complete a request */
+99 -24
drivers/block/null_blk_zoned.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 #include <linux/vmalloc.h> 3 + #include <linux/bitmap.h> 3 4 #include "null_blk.h" 4 5 5 6 #define CREATE_TRACE_POINTS ··· 45 44 GFP_KERNEL | __GFP_ZERO); 46 45 if (!dev->zones) 47 46 return -ENOMEM; 47 + 48 + spin_lock_init(&dev->zone_dev_lock); 49 + dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); 50 + if (!dev->zone_locks) { 51 + kvfree(dev->zones); 52 + return -ENOMEM; 53 + } 48 54 49 55 if (dev->zone_nr_conv >= dev->nr_zones) { 50 56 dev->zone_nr_conv = dev->nr_zones - 1; ··· 131 123 132 124 void null_free_zoned_dev(struct nullb_device *dev) 133 125 { 126 + bitmap_free(dev->zone_locks); 134 127 kvfree(dev->zones); 128 + } 129 + 130 + static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno) 131 + { 132 + wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); 133 + } 134 + 135 + static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno) 136 + { 137 + clear_and_wake_up_bit(zno, dev->zone_locks); 135 138 } 136 139 137 140 int null_report_zones(struct gendisk *disk, sector_t sector, ··· 150 131 { 151 132 struct nullb *nullb = disk->private_data; 152 133 struct nullb_device *dev = nullb->dev; 153 - unsigned int first_zone, i; 134 + unsigned int first_zone, i, zno; 154 135 struct blk_zone zone; 155 136 int error; 156 137 ··· 161 142 nr_zones = min(nr_zones, dev->nr_zones - first_zone); 162 143 trace_nullb_report_zones(nullb, nr_zones); 163 144 164 - for (i = 0; i < nr_zones; i++) { 145 + zno = first_zone; 146 + for (i = 0; i < nr_zones; i++, zno++) { 165 147 /* 166 148 * Stacked DM target drivers will remap the zone information by 167 149 * modifying the zone information passed to the report callback. 168 150 * So use a local copy to avoid corruption of the device zone 169 151 * array. 
170 152 */ 171 - memcpy(&zone, &dev->zones[first_zone + i], 172 - sizeof(struct blk_zone)); 153 + null_lock_zone(dev, zno); 154 + memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone)); 155 + null_unlock_zone(dev, zno); 156 + 173 157 error = cb(&zone, i, data); 174 158 if (error) 175 159 return error; ··· 181 159 return nr_zones; 182 160 } 183 161 162 + /* 163 + * This is called in the case of memory backing from null_process_cmd() 164 + * with the target zone already locked. 165 + */ 184 166 size_t null_zone_valid_read_len(struct nullb *nullb, 185 167 sector_t sector, unsigned int len) 186 168 { ··· 321 295 if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) 322 296 return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); 323 297 298 + null_lock_zone(dev, zno); 299 + spin_lock(&dev->zone_dev_lock); 300 + 324 301 switch (zone->cond) { 325 302 case BLK_ZONE_COND_FULL: 326 303 /* Cannot write to a full zone */ 327 - return BLK_STS_IOERR; 304 + ret = BLK_STS_IOERR; 305 + goto unlock; 328 306 case BLK_ZONE_COND_EMPTY: 329 307 case BLK_ZONE_COND_CLOSED: 330 308 ret = null_check_zone_resources(dev, zone); 331 309 if (ret != BLK_STS_OK) 332 - return ret; 310 + goto unlock; 333 311 break; 334 312 case BLK_ZONE_COND_IMP_OPEN: 335 313 case BLK_ZONE_COND_EXP_OPEN: 336 314 break; 337 315 default: 338 316 /* Invalid zone condition */ 339 - return BLK_STS_IOERR; 317 + ret = BLK_STS_IOERR; 318 + goto unlock; 340 319 } 341 320 342 321 /* ··· 357 326 else 358 327 cmd->rq->__sector = sector; 359 328 } else if (sector != zone->wp) { 360 - return BLK_STS_IOERR; 329 + ret = BLK_STS_IOERR; 330 + goto unlock; 361 331 } 362 332 363 - if (zone->wp + nr_sectors > zone->start + zone->capacity) 364 - return BLK_STS_IOERR; 333 + if (zone->wp + nr_sectors > zone->start + zone->capacity) { 334 + ret = BLK_STS_IOERR; 335 + goto unlock; 336 + } 365 337 366 338 if (zone->cond == BLK_ZONE_COND_CLOSED) { 367 339 dev->nr_zones_closed--; ··· 375 341 if (zone->cond != BLK_ZONE_COND_EXP_OPEN) 376 342 
zone->cond = BLK_ZONE_COND_IMP_OPEN; 377 343 344 + spin_unlock(&dev->zone_dev_lock); 378 345 ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); 346 + spin_lock(&dev->zone_dev_lock); 379 347 if (ret != BLK_STS_OK) 380 - return ret; 348 + goto unlock; 381 349 382 350 zone->wp += nr_sectors; 383 351 if (zone->wp == zone->start + zone->capacity) { ··· 389 353 dev->nr_zones_imp_open--; 390 354 zone->cond = BLK_ZONE_COND_FULL; 391 355 } 392 - return BLK_STS_OK; 356 + ret = BLK_STS_OK; 357 + 358 + unlock: 359 + spin_unlock(&dev->zone_dev_lock); 360 + null_unlock_zone(dev, zno); 361 + 362 + return ret; 393 363 } 394 364 395 365 static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) ··· 506 464 sector_t sector) 507 465 { 508 466 struct nullb_device *dev = cmd->nq->dev; 509 - unsigned int zone_no = null_zone_no(dev, sector); 510 - struct blk_zone *zone = &dev->zones[zone_no]; 511 - blk_status_t ret = BLK_STS_OK; 467 + unsigned int zone_no; 468 + struct blk_zone *zone; 469 + blk_status_t ret; 512 470 size_t i; 513 471 472 + if (op == REQ_OP_ZONE_RESET_ALL) { 473 + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { 474 + null_lock_zone(dev, i); 475 + zone = &dev->zones[i]; 476 + if (zone->cond != BLK_ZONE_COND_EMPTY) { 477 + spin_lock(&dev->zone_dev_lock); 478 + null_reset_zone(dev, zone); 479 + spin_unlock(&dev->zone_dev_lock); 480 + trace_nullb_zone_op(cmd, i, zone->cond); 481 + } 482 + null_unlock_zone(dev, i); 483 + } 484 + return BLK_STS_OK; 485 + } 486 + 487 + zone_no = null_zone_no(dev, sector); 488 + zone = &dev->zones[zone_no]; 489 + 490 + null_lock_zone(dev, zone_no); 491 + spin_lock(&dev->zone_dev_lock); 492 + 514 493 switch (op) { 515 - case REQ_OP_ZONE_RESET_ALL: 516 - for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) 517 - null_reset_zone(dev, &dev->zones[i]); 518 - break; 519 494 case REQ_OP_ZONE_RESET: 520 495 ret = null_reset_zone(dev, zone); 521 496 break; ··· 546 487 ret = null_finish_zone(dev, zone); 547 488 
break; 548 489 default: 549 - return BLK_STS_NOTSUPP; 490 + ret = BLK_STS_NOTSUPP; 491 + break; 550 492 } 493 + 494 + spin_unlock(&dev->zone_dev_lock); 551 495 552 496 if (ret == BLK_STS_OK) 553 497 trace_nullb_zone_op(cmd, zone_no, zone->cond); 498 + 499 + null_unlock_zone(dev, zone_no); 554 500 555 501 return ret; 556 502 } ··· 563 499 blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, 564 500 sector_t sector, sector_t nr_sectors) 565 501 { 502 + struct nullb_device *dev = cmd->nq->dev; 503 + unsigned int zno = null_zone_no(dev, sector); 504 + blk_status_t sts; 505 + 566 506 switch (op) { 567 507 case REQ_OP_WRITE: 568 - return null_zone_write(cmd, sector, nr_sectors, false); 508 + sts = null_zone_write(cmd, sector, nr_sectors, false); 509 + break; 569 510 case REQ_OP_ZONE_APPEND: 570 - return null_zone_write(cmd, sector, nr_sectors, true); 511 + sts = null_zone_write(cmd, sector, nr_sectors, true); 512 + break; 571 513 case REQ_OP_ZONE_RESET: 572 514 case REQ_OP_ZONE_RESET_ALL: 573 515 case REQ_OP_ZONE_OPEN: 574 516 case REQ_OP_ZONE_CLOSE: 575 517 case REQ_OP_ZONE_FINISH: 576 - return null_zone_mgmt(cmd, op, sector); 518 + sts = null_zone_mgmt(cmd, op, sector); 519 + break; 577 520 default: 578 - return null_process_cmd(cmd, op, sector, nr_sectors); 521 + null_lock_zone(dev, zno); 522 + sts = null_process_cmd(cmd, op, sector, nr_sectors); 523 + null_unlock_zone(dev, zno); 579 524 } 525 + 526 + return sts; 580 527 }
+26 -23
drivers/block/xsysace.c
··· 443 443 #define ACE_FSM_NUM_STATES 11 444 444 445 445 /* Set flag to exit FSM loop and reschedule tasklet */ 446 - static inline void ace_fsm_yield(struct ace_device *ace) 446 + static inline void ace_fsm_yieldpoll(struct ace_device *ace) 447 447 { 448 - dev_dbg(ace->dev, "ace_fsm_yield()\n"); 449 448 tasklet_schedule(&ace->fsm_tasklet); 450 449 ace->fsm_continue_flag = 0; 450 + } 451 + 452 + static inline void ace_fsm_yield(struct ace_device *ace) 453 + { 454 + dev_dbg(ace->dev, "%s()\n", __func__); 455 + ace_fsm_yieldpoll(ace); 451 456 } 452 457 453 458 /* Set flag to exit FSM loop and wait for IRQ to reschedule tasklet */ ··· 460 455 { 461 456 dev_dbg(ace->dev, "ace_fsm_yieldirq()\n"); 462 457 463 - if (!ace->irq) 464 - /* No IRQ assigned, so need to poll */ 465 - tasklet_schedule(&ace->fsm_tasklet); 466 - ace->fsm_continue_flag = 0; 458 + if (ace->irq > 0) 459 + ace->fsm_continue_flag = 0; 460 + else 461 + ace_fsm_yieldpoll(ace); 467 462 } 468 463 469 464 static bool ace_has_next_request(struct request_queue *q) ··· 1058 1053 ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ); 1059 1054 1060 1055 /* Now we can hook up the irq handler */ 1061 - if (ace->irq) { 1056 + if (ace->irq > 0) { 1062 1057 rc = request_irq(ace->irq, ace_interrupt, 0, "systemace", ace); 1063 1058 if (rc) { 1064 1059 /* Failure - fall back to polled mode */ 1065 1060 dev_err(ace->dev, "request_irq failed\n"); 1066 - ace->irq = 0; 1061 + ace->irq = rc; 1067 1062 } 1068 1063 } 1069 1064 ··· 1115 1110 1116 1111 tasklet_kill(&ace->fsm_tasklet); 1117 1112 1118 - if (ace->irq) 1113 + if (ace->irq > 0) 1119 1114 free_irq(ace->irq, ace); 1120 1115 1121 1116 iounmap(ace->baseaddr); ··· 1127 1122 struct ace_device *ace; 1128 1123 int rc; 1129 1124 dev_dbg(dev, "ace_alloc(%p)\n", dev); 1130 - 1131 - if (!physaddr) { 1132 - rc = -ENODEV; 1133 - goto err_noreg; 1134 - } 1135 1125 1136 1126 /* Allocate and initialize the ace device structure */ 1137 1127 ace = kzalloc(sizeof(struct ace_device), GFP_KERNEL); 
··· 1153 1153 dev_set_drvdata(dev, NULL); 1154 1154 kfree(ace); 1155 1155 err_alloc: 1156 - err_noreg: 1157 1156 dev_err(dev, "could not initialize device, err=%i\n", rc); 1158 1157 return rc; 1159 1158 } ··· 1175 1176 1176 1177 static int ace_probe(struct platform_device *dev) 1177 1178 { 1178 - resource_size_t physaddr = 0; 1179 1179 int bus_width = ACE_BUS_WIDTH_16; /* FIXME: should not be hard coded */ 1180 + resource_size_t physaddr; 1181 + struct resource *res; 1180 1182 u32 id = dev->id; 1181 - int irq = 0; 1183 + int irq; 1182 1184 int i; 1183 1185 1184 1186 dev_dbg(&dev->dev, "ace_probe(%p)\n", dev); ··· 1190 1190 if (of_find_property(dev->dev.of_node, "8-bit", NULL)) 1191 1191 bus_width = ACE_BUS_WIDTH_8; 1192 1192 1193 - for (i = 0; i < dev->num_resources; i++) { 1194 - if (dev->resource[i].flags & IORESOURCE_MEM) 1195 - physaddr = dev->resource[i].start; 1196 - if (dev->resource[i].flags & IORESOURCE_IRQ) 1197 - irq = dev->resource[i].start; 1198 - } 1193 + res = platform_get_resource(dev, IORESOURCE_MEM, 0); 1194 + if (!res) 1195 + return -EINVAL; 1196 + 1197 + physaddr = res->start; 1198 + if (!physaddr) 1199 + return -ENODEV; 1200 + 1201 + irq = platform_get_irq_optional(dev, 0); 1199 1202 1200 1203 /* Call the bus-independent setup code */ 1201 1204 return ace_alloc(&dev->dev, id, physaddr, irq, bus_width);
+1 -1
drivers/nvme/host/core.c
··· 2125 2125 2126 2126 if (blk_queue_is_zoned(ns->queue)) { 2127 2127 ret = nvme_revalidate_zones(ns); 2128 - if (ret) 2128 + if (ret && !nvme_first_scan(ns->disk)) 2129 2129 return ret; 2130 2130 } 2131 2131
+111 -163
drivers/nvme/host/fc.c
··· 146 146 147 147 /* fc_ctrl flags values - specified as bit positions */ 148 148 #define ASSOC_ACTIVE 0 149 - #define FCCTRL_TERMIO 1 149 + #define ASSOC_FAILED 1 150 + #define FCCTRL_TERMIO 2 150 151 151 152 struct nvme_fc_ctrl { 152 153 spinlock_t lock; ··· 158 157 u32 cnum; 159 158 160 159 bool ioq_live; 161 - atomic_t err_work_active; 162 160 u64 association_id; 163 161 struct nvmefc_ls_rcv_op *rcv_disconn; 164 162 ··· 167 167 struct blk_mq_tag_set tag_set; 168 168 169 169 struct delayed_work connect_work; 170 - struct work_struct err_work; 171 170 172 171 struct kref ref; 173 172 unsigned long flags; ··· 2413 2414 nvme_fc_ctrl_put(ctrl); 2414 2415 } 2415 2416 2417 + /* 2418 + * This routine is used by the transport when it needs to find active 2419 + * io on a queue that is to be terminated. The transport uses 2420 + * blk_mq_tagset_busy_itr() to find the busy requests, which then invoke 2421 + * this routine to kill them on a 1 by 1 basis. 2422 + * 2423 + * As FC allocates FC exchange for each io, the transport must contact 2424 + * the LLDD to terminate the exchange, thus releasing the FC exchange. 2425 + * After terminating the exchange the LLDD will call the transport's 2426 + * normal io done path for the request, but it will have an aborted 2427 + * status. The done path will return the io request back to the block 2428 + * layer with an error status. 2429 + */ 2430 + static bool 2431 + nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) 2432 + { 2433 + struct nvme_ctrl *nctrl = data; 2434 + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); 2435 + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); 2436 + 2437 + __nvme_fc_abort_op(ctrl, op); 2438 + return true; 2439 + } 2440 + 2441 + /* 2442 + * This routine runs through all outstanding commands on the association 2443 + * and aborts them. This routine is typically be called by the 2444 + * delete_association routine. It is also called due to an error during 2445 + * reconnect. 
In that scenario, it is most likely a command that initializes 2446 + * the controller, including fabric Connect commands on io queues, that 2447 + * may have timed out or failed thus the io must be killed for the connect 2448 + * thread to see the error. 2449 + */ 2450 + static void 2451 + __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) 2452 + { 2453 + /* 2454 + * If io queues are present, stop them and terminate all outstanding 2455 + * ios on them. As FC allocates FC exchange for each io, the 2456 + * transport must contact the LLDD to terminate the exchange, 2457 + * thus releasing the FC exchange. We use blk_mq_tagset_busy_itr() 2458 + * to tell us what io's are busy and invoke a transport routine 2459 + * to kill them with the LLDD. After terminating the exchange 2460 + * the LLDD will call the transport's normal io done path, but it 2461 + * will have an aborted status. The done path will return the 2462 + * io requests back to the block layer as part of normal completions 2463 + * (but with error status). 2464 + */ 2465 + if (ctrl->ctrl.queue_count > 1) { 2466 + nvme_stop_queues(&ctrl->ctrl); 2467 + blk_mq_tagset_busy_iter(&ctrl->tag_set, 2468 + nvme_fc_terminate_exchange, &ctrl->ctrl); 2469 + blk_mq_tagset_wait_completed_request(&ctrl->tag_set); 2470 + if (start_queues) 2471 + nvme_start_queues(&ctrl->ctrl); 2472 + } 2473 + 2474 + /* 2475 + * Other transports, which don't have link-level contexts bound 2476 + * to sqe's, would try to gracefully shutdown the controller by 2477 + * writing the registers for shutdown and polling (call 2478 + * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially 2479 + * just aborted and we will wait on those contexts, and given 2480 + * there was no indication of how live the controlelr is on the 2481 + * link, don't send more io to create more contexts for the 2482 + * shutdown. Let the controller fail via keepalive failure if 2483 + * its still present. 
2484 + */ 2485 + 2486 + /* 2487 + * clean up the admin queue. Same thing as above. 2488 + */ 2489 + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 2490 + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, 2491 + nvme_fc_terminate_exchange, &ctrl->ctrl); 2492 + blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); 2493 + } 2494 + 2416 2495 static void 2417 2496 nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) 2418 2497 { 2419 - int active; 2420 - 2421 2498 /* 2422 - * if an error (io timeout, etc) while (re)connecting, 2423 - * it's an error on creating the new association. 2424 - * Start the error recovery thread if it hasn't already 2425 - * been started. It is expected there could be multiple 2426 - * ios hitting this path before things are cleaned up. 2499 + * if an error (io timeout, etc) while (re)connecting, the remote 2500 + * port requested terminating of the association (disconnect_ls) 2501 + * or an error (timeout or abort) occurred on an io while creating 2502 + * the controller. Abort any ios on the association and let the 2503 + * create_association error path resolve things. 2427 2504 */ 2428 2505 if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { 2429 - active = atomic_xchg(&ctrl->err_work_active, 1); 2430 - if (!active && !queue_work(nvme_fc_wq, &ctrl->err_work)) { 2431 - atomic_set(&ctrl->err_work_active, 0); 2432 - WARN_ON(1); 2433 - } 2506 + __nvme_fc_abort_outstanding_ios(ctrl, true); 2507 + set_bit(ASSOC_FAILED, &ctrl->flags); 2434 2508 return; 2435 2509 } 2436 2510 ··· 2817 2745 nvme_fc_ctrl_put(ctrl); 2818 2746 } 2819 2747 2820 - /* 2821 - * This routine is used by the transport when it needs to find active 2822 - * io on a queue that is to be terminated. The transport uses 2823 - * blk_mq_tagset_busy_itr() to find the busy requests, which then invoke 2824 - * this routine to kill them on a 1 by 1 basis. 
2825 - * 2826 - * As FC allocates FC exchange for each io, the transport must contact 2827 - * the LLDD to terminate the exchange, thus releasing the FC exchange. 2828 - * After terminating the exchange the LLDD will call the transport's 2829 - * normal io done path for the request, but it will have an aborted 2830 - * status. The done path will return the io request back to the block 2831 - * layer with an error status. 2832 - */ 2833 - static bool 2834 - nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) 2835 - { 2836 - struct nvme_ctrl *nctrl = data; 2837 - struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); 2838 - struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); 2839 - 2840 - __nvme_fc_abort_op(ctrl, op); 2841 - return true; 2842 - } 2843 - 2844 2748 2845 2749 static const struct blk_mq_ops nvme_fc_mq_ops = { 2846 2750 .queue_rq = nvme_fc_queue_rq, ··· 3036 2988 ctrl->cnum, ctrl->lport->localport.port_name, 3037 2989 ctrl->rport->remoteport.port_name, ctrl->ctrl.opts->subsysnqn); 3038 2990 2991 + clear_bit(ASSOC_FAILED, &ctrl->flags); 2992 + 3039 2993 /* 3040 2994 * Create the admin queue 3041 2995 */ ··· 3066 3016 */ 3067 3017 3068 3018 ret = nvme_enable_ctrl(&ctrl->ctrl); 3069 - if (ret) 3019 + if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) 3070 3020 goto out_disconnect_admin_queue; 3071 3021 3072 3022 ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments; ··· 3076 3026 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 3077 3027 3078 3028 ret = nvme_init_identify(&ctrl->ctrl); 3079 - if (ret) 3029 + if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) 3080 3030 goto out_disconnect_admin_queue; 3081 3031 3082 3032 /* sanity checks */ ··· 3121 3071 ret = nvme_fc_create_io_queues(ctrl); 3122 3072 else 3123 3073 ret = nvme_fc_recreate_io_queues(ctrl); 3124 - if (ret) 3125 - goto out_term_aen_ops; 3126 3074 } 3075 + if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) 3076 + goto out_term_aen_ops; 3127 3077 3128 3078 changed = 
nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 3129 3079 ··· 3156 3106 return ret; 3157 3107 } 3158 3108 3159 - 3160 - /* 3161 - * This routine runs through all outstanding commands on the association 3162 - * and aborts them. This routine is typically be called by the 3163 - * delete_association routine. It is also called due to an error during 3164 - * reconnect. In that scenario, it is most likely a command that initializes 3165 - * the controller, including fabric Connect commands on io queues, that 3166 - * may have timed out or failed thus the io must be killed for the connect 3167 - * thread to see the error. 3168 - */ 3169 - static void 3170 - __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) 3171 - { 3172 - /* 3173 - * If io queues are present, stop them and terminate all outstanding 3174 - * ios on them. As FC allocates FC exchange for each io, the 3175 - * transport must contact the LLDD to terminate the exchange, 3176 - * thus releasing the FC exchange. We use blk_mq_tagset_busy_itr() 3177 - * to tell us what io's are busy and invoke a transport routine 3178 - * to kill them with the LLDD. After terminating the exchange 3179 - * the LLDD will call the transport's normal io done path, but it 3180 - * will have an aborted status. The done path will return the 3181 - * io requests back to the block layer as part of normal completions 3182 - * (but with error status). 
3183 - */ 3184 - if (ctrl->ctrl.queue_count > 1) { 3185 - nvme_stop_queues(&ctrl->ctrl); 3186 - blk_mq_tagset_busy_iter(&ctrl->tag_set, 3187 - nvme_fc_terminate_exchange, &ctrl->ctrl); 3188 - blk_mq_tagset_wait_completed_request(&ctrl->tag_set); 3189 - if (start_queues) 3190 - nvme_start_queues(&ctrl->ctrl); 3191 - } 3192 - 3193 - /* 3194 - * Other transports, which don't have link-level contexts bound 3195 - * to sqe's, would try to gracefully shutdown the controller by 3196 - * writing the registers for shutdown and polling (call 3197 - * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially 3198 - * just aborted and we will wait on those contexts, and given 3199 - * there was no indication of how live the controlelr is on the 3200 - * link, don't send more io to create more contexts for the 3201 - * shutdown. Let the controller fail via keepalive failure if 3202 - * its still present. 3203 - */ 3204 - 3205 - /* 3206 - * clean up the admin queue. Same thing as above. 3207 - */ 3208 - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 3209 - blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, 3210 - nvme_fc_terminate_exchange, &ctrl->ctrl); 3211 - blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); 3212 - } 3213 3109 3214 3110 /* 3215 3111 * This routine stops operation of the controller on the host side. ··· 3233 3237 { 3234 3238 struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); 3235 3239 3236 - cancel_work_sync(&ctrl->err_work); 3237 3240 cancel_delayed_work_sync(&ctrl->connect_work); 3238 3241 /* 3239 3242 * kill the association on the link side. this will block ··· 3287 3292 } 3288 3293 3289 3294 static void 3290 - __nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl) 3291 - { 3292 - /* 3293 - * if state is CONNECTING - the error occurred as part of a 3294 - * reconnect attempt. Abort any ios on the association and 3295 - * let the create_association error paths resolve things. 
3296 - */ 3297 - if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { 3298 - __nvme_fc_abort_outstanding_ios(ctrl, true); 3299 - return; 3300 - } 3301 - 3302 - /* 3303 - * For any other state, kill the association. As this routine 3304 - * is a common io abort routine for resetting and such, after 3305 - * the association is terminated, ensure that the state is set 3306 - * to CONNECTING. 3307 - */ 3308 - 3309 - nvme_stop_keep_alive(&ctrl->ctrl); 3310 - 3311 - /* will block will waiting for io to terminate */ 3312 - nvme_fc_delete_association(ctrl); 3313 - 3314 - if (ctrl->ctrl.state != NVME_CTRL_CONNECTING && 3315 - !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) 3316 - dev_err(ctrl->ctrl.device, 3317 - "NVME-FC{%d}: error_recovery: Couldn't change state " 3318 - "to CONNECTING\n", ctrl->cnum); 3319 - } 3320 - 3321 - static void 3322 3295 nvme_fc_reset_ctrl_work(struct work_struct *work) 3323 3296 { 3324 3297 struct nvme_fc_ctrl *ctrl = 3325 3298 container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); 3326 - int ret; 3327 - 3328 - __nvme_fc_terminate_io(ctrl); 3329 3299 3330 3300 nvme_stop_ctrl(&ctrl->ctrl); 3331 3301 3332 - if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) 3333 - ret = nvme_fc_create_association(ctrl); 3334 - else 3335 - ret = -ENOTCONN; 3302 + /* will block will waiting for io to terminate */ 3303 + nvme_fc_delete_association(ctrl); 3336 3304 3337 - if (ret) 3338 - nvme_fc_reconnect_or_delete(ctrl, ret); 3339 - else 3340 - dev_info(ctrl->ctrl.device, 3341 - "NVME-FC{%d}: controller reset complete\n", 3342 - ctrl->cnum); 3305 + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) 3306 + dev_err(ctrl->ctrl.device, 3307 + "NVME-FC{%d}: error_recovery: Couldn't change state " 3308 + "to CONNECTING\n", ctrl->cnum); 3309 + 3310 + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) { 3311 + if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) { 3312 + dev_err(ctrl->ctrl.device, 3313 + "NVME-FC{%d}: failed 
to schedule connect " 3314 + "after reset\n", ctrl->cnum); 3315 + } else { 3316 + flush_delayed_work(&ctrl->connect_work); 3317 + } 3318 + } else { 3319 + nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN); 3320 + } 3343 3321 } 3344 3322 3345 - static void 3346 - nvme_fc_connect_err_work(struct work_struct *work) 3347 - { 3348 - struct nvme_fc_ctrl *ctrl = 3349 - container_of(work, struct nvme_fc_ctrl, err_work); 3350 - 3351 - __nvme_fc_terminate_io(ctrl); 3352 - 3353 - atomic_set(&ctrl->err_work_active, 0); 3354 - 3355 - /* 3356 - * Rescheduling the connection after recovering 3357 - * from the io error is left to the reconnect work 3358 - * item, which is what should have stalled waiting on 3359 - * the io that had the error that scheduled this work. 3360 - */ 3361 - } 3362 3323 3363 3324 static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { 3364 3325 .name = "fc", ··· 3442 3491 ctrl->dev = lport->dev; 3443 3492 ctrl->cnum = idx; 3444 3493 ctrl->ioq_live = false; 3445 - atomic_set(&ctrl->err_work_active, 0); 3446 3494 init_waitqueue_head(&ctrl->ioabort_wait); 3447 3495 3448 3496 get_device(ctrl->dev); ··· 3449 3499 3450 3500 INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); 3451 3501 INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); 3452 - INIT_WORK(&ctrl->err_work, nvme_fc_connect_err_work); 3453 3502 spin_lock_init(&ctrl->lock); 3454 3503 3455 3504 /* io queue count */ ··· 3541 3592 fail_ctrl: 3542 3593 nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); 3543 3594 cancel_work_sync(&ctrl->ctrl.reset_work); 3544 - cancel_work_sync(&ctrl->err_work); 3545 3595 cancel_delayed_work_sync(&ctrl->connect_work); 3546 3596 3547 3597 ctrl->ctrl.opts = NULL;
+8
drivers/nvme/host/rdma.c
··· 1768 1768 return; 1769 1769 } 1770 1770 1771 + /* sanity checking for received data length */ 1772 + if (unlikely(wc->byte_len < len)) { 1773 + dev_err(queue->ctrl->ctrl.device, 1774 + "Unexpected nvme completion length(%d)\n", wc->byte_len); 1775 + nvme_rdma_error_recovery(queue->ctrl); 1776 + return; 1777 + } 1778 + 1771 1779 ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); 1772 1780 /* 1773 1781 * AEN requests are special as they don't time out and can
+2 -2
drivers/nvme/target/core.c
··· 907 907 req->error_loc = NVMET_NO_ERROR_LOC; 908 908 req->error_slba = 0; 909 909 910 - trace_nvmet_req_init(req, req->cmd); 911 - 912 910 /* no support for fused commands yet */ 913 911 if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { 914 912 req->error_loc = offsetof(struct nvme_common_command, flags); ··· 935 937 936 938 if (status) 937 939 goto fail; 940 + 941 + trace_nvmet_req_init(req, req->cmd); 938 942 939 943 if (unlikely(!percpu_ref_tryget_live(&sq->ref))) { 940 944 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+7 -14
drivers/nvme/target/trace.h
··· 46 46 return req->sq->ctrl; 47 47 } 48 48 49 - static inline void __assign_disk_name(char *name, struct nvmet_req *req, 50 - bool init) 49 + static inline void __assign_req_name(char *name, struct nvmet_req *req) 51 50 { 52 - struct nvmet_ctrl *ctrl = nvmet_req_to_ctrl(req); 53 - struct nvmet_ns *ns; 54 - 55 - if ((init && req->sq->qid) || (!init && req->cq->qid)) { 56 - ns = nvmet_find_namespace(ctrl, req->cmd->rw.nsid); 57 - strncpy(name, ns->device_path, DISK_NAME_LEN); 58 - return; 59 - } 60 - 61 - memset(name, 0, DISK_NAME_LEN); 51 + if (req->ns) 52 + strncpy(name, req->ns->device_path, DISK_NAME_LEN); 53 + else 54 + memset(name, 0, DISK_NAME_LEN); 62 55 } 63 56 #endif 64 57 ··· 74 81 TP_fast_assign( 75 82 __entry->cmd = cmd; 76 83 __entry->ctrl = nvmet_req_to_ctrl(req); 77 - __assign_disk_name(__entry->disk, req, true); 84 + __assign_req_name(__entry->disk, req); 78 85 __entry->qid = req->sq->qid; 79 86 __entry->cid = cmd->common.command_id; 80 87 __entry->opcode = cmd->common.opcode; ··· 114 121 __entry->cid = req->cqe->command_id; 115 122 __entry->result = le64_to_cpu(req->cqe->result.u64); 116 123 __entry->status = le16_to_cpu(req->cqe->status) >> 1; 117 - __assign_disk_name(__entry->disk, req, false); 124 + __assign_req_name(__entry->disk, req); 118 125 ), 119 126 TP_printk("nvmet%s: %sqid=%d, cmdid=%u, res=%#llx, status=%#x", 120 127 __print_ctrl_name(__entry->ctrl),
+1 -1
lib/scatterlist.c
··· 933 933 sg_miter_start(&miter, sgl, nents, sg_flags); 934 934 935 935 if (!sg_miter_skip(&miter, skip)) 936 - return false; 936 + return 0; 937 937 938 938 while ((offset < buflen) && sg_miter_next(&miter)) { 939 939 unsigned int len;