Merge branch 'for-linus' of git://git.kernel.dk/linux-block

+2 -4

Documentation/block/data-integrity.txt

··· 192 192 supported by the block device. 193 193 194 194 195 - int bio_integrity_prep(bio); 195 + bool bio_integrity_prep(bio); 196 196 197 197 To generate IMD for WRITE and to set up buffers for READ, the 198 198 filesystem must call bio_integrity_prep(bio). ··· 201 201 sector must be set, and the bio should have all data pages 202 202 added. It is up to the caller to ensure that the bio does not 203 203 change while I/O is in progress. 204 - 205 - bio_integrity_prep() should only be called if 206 - bio_integrity_enabled() returned 1. 204 + Complete bio with error if prepare failed for some reason. 207 205 208 206 209 207 5.3 PASSING EXISTING INTEGRITY METADATA

+10 -4

block/bfq-iosched.c

··· 3483 3483 } 3484 3484 } 3485 3485 } 3486 - /* Update weight both if it must be raised and if it must be lowered */ 3486 + /* 3487 + * To improve latency (for this or other queues), immediately 3488 + * update weight both if it must be raised and if it must be 3489 + * lowered. Since, entity may be on some active tree here, and 3490 + * might have a pending change of its ioprio class, invoke 3491 + * next function with the last parameter unset (see the 3492 + * comments on the function). 3493 + */ 3487 3494 if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) 3488 - __bfq_entity_update_weight_prio( 3489 - bfq_entity_service_tree(entity), 3490 - entity); 3495 + __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), 3496 + entity, false); 3491 3497 } 3492 3498 3493 3499 /*

+2 -1

block/bfq-iosched.h

··· 892 892 struct bfq_entity *entity); 893 893 struct bfq_service_tree * 894 894 __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, 895 - struct bfq_entity *entity); 895 + struct bfq_entity *entity, 896 + bool update_class_too); 896 897 void bfq_bfqq_served(struct bfq_queue *bfqq, int served); 897 898 void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, 898 899 unsigned long time_ms);

+34 -5

block/bfq-wf2q.c

··· 694 694 return sched_data->service_tree + idx; 695 695 } 696 696 697 - 697 + /* 698 + * Update weight and priority of entity. If update_class_too is true, 699 + * then update the ioprio_class of entity too. 700 + * 701 + * The reason why the update of ioprio_class is controlled through the 702 + * last parameter is as follows. Changing the ioprio class of an 703 + * entity implies changing the destination service trees for that 704 + * entity. If such a change occurred when the entity is already on one 705 + * of the service trees for its previous class, then the state of the 706 + * entity would become more complex: none of the new possible service 707 + * trees for the entity, according to bfq_entity_service_tree(), would 708 + * match any of the possible service trees on which the entity 709 + * is. Complex operations involving these trees, such as entity 710 + * activations and deactivations, should take into account this 711 + * additional complexity. To avoid this issue, this function is 712 + * invoked with update_class_too unset in the points in the code where 713 + * entity may happen to be on some tree. 714 + */ 698 715 struct bfq_service_tree * 699 716 __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, 700 - struct bfq_entity *entity) 717 + struct bfq_entity *entity, 718 + bool update_class_too) 701 719 { 702 720 struct bfq_service_tree *new_st = old_st; 703 721 ··· 757 739 bfq_weight_to_ioprio(entity->orig_weight); 758 740 } 759 741 760 - if (bfqq) 742 + if (bfqq && update_class_too) 761 743 bfqq->ioprio_class = bfqq->new_ioprio_class; 762 - entity->prio_changed = 0; 744 + 745 + /* 746 + * Reset prio_changed only if the ioprio_class change 747 + * is not pending any longer. 
748 + */ 749 + if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) 750 + entity->prio_changed = 0; 763 751 764 752 /* 765 753 * NOTE: here we may be changing the weight too early, ··· 891 867 { 892 868 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); 893 869 894 - st = __bfq_entity_update_weight_prio(st, entity); 870 + /* 871 + * When this function is invoked, entity is not in any service 872 + * tree, then it is safe to invoke next function with the last 873 + * parameter set (see the comments on the function). 874 + */ 875 + st = __bfq_entity_update_weight_prio(st, entity, true); 895 876 bfq_calc_finish(entity, entity->budget); 896 877 897 878 /*

+75 -92

block/bio-integrity.c

··· 102 102 * Description: Used to free the integrity portion of a bio. Usually 103 103 * called from bio_free(). 104 104 */ 105 - void bio_integrity_free(struct bio *bio) 105 + static void bio_integrity_free(struct bio *bio) 106 106 { 107 107 struct bio_integrity_payload *bip = bio_integrity(bio); 108 108 struct bio_set *bs = bio->bi_pool; ··· 120 120 } 121 121 122 122 bio->bi_integrity = NULL; 123 + bio->bi_opf &= ~REQ_INTEGRITY; 123 124 } 124 - EXPORT_SYMBOL(bio_integrity_free); 125 125 126 126 /** 127 127 * bio_integrity_add_page - Attach integrity metadata ··· 160 160 EXPORT_SYMBOL(bio_integrity_add_page); 161 161 162 162 /** 163 - * bio_integrity_enabled - Check whether integrity can be passed 164 - * @bio: bio to check 165 - * 166 - * Description: Determines whether bio_integrity_prep() can be called 167 - * on this bio or not. bio data direction and target device must be 168 - * set prior to calling. The functions honors the write_generate and 169 - * read_verify flags in sysfs. 170 - */ 171 - bool bio_integrity_enabled(struct bio *bio) 172 - { 173 - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 174 - 175 - if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) 176 - return false; 177 - 178 - if (!bio_sectors(bio)) 179 - return false; 180 - 181 - /* Already protected? 
*/ 182 - if (bio_integrity(bio)) 183 - return false; 184 - 185 - if (bi == NULL) 186 - return false; 187 - 188 - if (bio_data_dir(bio) == READ && bi->profile->verify_fn != NULL && 189 - (bi->flags & BLK_INTEGRITY_VERIFY)) 190 - return true; 191 - 192 - if (bio_data_dir(bio) == WRITE && bi->profile->generate_fn != NULL && 193 - (bi->flags & BLK_INTEGRITY_GENERATE)) 194 - return true; 195 - 196 - return false; 197 - } 198 - EXPORT_SYMBOL(bio_integrity_enabled); 199 - 200 - /** 201 163 * bio_integrity_intervals - Return number of integrity intervals for a bio 202 164 * @bi: blk_integrity profile for device 203 165 * @sectors: Size of the bio in 512-byte sectors ··· 184 222 /** 185 223 * bio_integrity_process - Process integrity metadata for a bio 186 224 * @bio: bio to generate/verify integrity metadata for 225 + * @proc_iter: iterator to process 187 226 * @proc_fn: Pointer to the relevant processing function 188 227 */ 189 228 static blk_status_t bio_integrity_process(struct bio *bio, 190 - integrity_processing_fn *proc_fn) 229 + struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) 191 230 { 192 231 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 193 232 struct blk_integrity_iter iter; ··· 201 238 202 239 iter.disk_name = bio->bi_bdev->bd_disk->disk_name; 203 240 iter.interval = 1 << bi->interval_exp; 204 - iter.seed = bip_get_seed(bip); 241 + iter.seed = proc_iter->bi_sector; 205 242 iter.prot_buf = prot_buf; 206 243 207 - bio_for_each_segment(bv, bio, bviter) { 244 + __bio_for_each_segment(bv, bio, bviter, *proc_iter) { 208 245 void *kaddr = kmap_atomic(bv.bv_page); 209 246 210 247 iter.data_buf = kaddr + bv.bv_offset; ··· 225 262 * bio_integrity_prep - Prepare bio for integrity I/O 226 263 * @bio: bio to prepare 227 264 * 228 - * Description: Allocates a buffer for integrity metadata, maps the 229 - * pages and attaches them to a bio. The bio must have data 230 - * direction, target device and start sector set priot to calling. 
In 231 - * the WRITE case, integrity metadata will be generated using the 232 - * block device's integrity function. In the READ case, the buffer 265 + * Description: Checks if the bio already has an integrity payload attached. 266 + * If it does, the payload has been generated by another kernel subsystem, 267 + * and we just pass it through. Otherwise allocates integrity payload. 268 + * The bio must have data direction, target device and start sector set priot 269 + * to calling. In the WRITE case, integrity metadata will be generated using 270 + * the block device's integrity function. In the READ case, the buffer 233 271 * will be prepared for DMA and a suitable end_io handler set up. 234 272 */ 235 - int bio_integrity_prep(struct bio *bio) 273 + bool bio_integrity_prep(struct bio *bio) 236 274 { 237 275 struct bio_integrity_payload *bip; 238 276 struct blk_integrity *bi; ··· 243 279 unsigned int len, nr_pages; 244 280 unsigned int bytes, offset, i; 245 281 unsigned int intervals; 282 + blk_status_t status; 246 283 247 284 bi = bdev_get_integrity(bio->bi_bdev); 248 285 q = bdev_get_queue(bio->bi_bdev); 249 - BUG_ON(bi == NULL); 250 - BUG_ON(bio_integrity(bio)); 286 + if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) 287 + return true; 251 288 289 + if (!bio_sectors(bio)) 290 + return true; 291 + 292 + /* Already protected? 
*/ 293 + if (bio_integrity(bio)) 294 + return true; 295 + 296 + if (bi == NULL) 297 + return true; 298 + 299 + if (bio_data_dir(bio) == READ) { 300 + if (!bi->profile->verify_fn || 301 + !(bi->flags & BLK_INTEGRITY_VERIFY)) 302 + return true; 303 + } else { 304 + if (!bi->profile->generate_fn || 305 + !(bi->flags & BLK_INTEGRITY_GENERATE)) 306 + return true; 307 + } 252 308 intervals = bio_integrity_intervals(bi, bio_sectors(bio)); 253 309 254 310 /* Allocate kernel buffer for protection data */ 255 311 len = intervals * bi->tuple_size; 256 312 buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); 313 + status = BLK_STS_RESOURCE; 257 314 if (unlikely(buf == NULL)) { 258 315 printk(KERN_ERR "could not allocate integrity buffer\n"); 259 - return -ENOMEM; 316 + goto err_end_io; 260 317 } 261 318 262 319 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; ··· 289 304 if (IS_ERR(bip)) { 290 305 printk(KERN_ERR "could not allocate data integrity bioset\n"); 291 306 kfree(buf); 292 - return PTR_ERR(bip); 307 + status = BLK_STS_RESOURCE; 308 + goto err_end_io; 293 309 } 294 310 295 311 bip->bip_flags |= BIP_BLOCK_INTEGRITY; ··· 316 330 bytes, offset); 317 331 318 332 if (ret == 0) 319 - return 0; 333 + return false; 320 334 321 335 if (ret < bytes) 322 336 break; ··· 326 340 offset = 0; 327 341 } 328 342 329 - /* Install custom I/O completion handler if read verify is enabled */ 330 - if (bio_data_dir(bio) == READ) { 331 - bip->bip_end_io = bio->bi_end_io; 332 - bio->bi_end_io = bio_integrity_endio; 333 - } 334 - 335 343 /* Auto-generate integrity metadata if this is a write */ 336 - if (bio_data_dir(bio) == WRITE) 337 - bio_integrity_process(bio, bi->profile->generate_fn); 344 + if (bio_data_dir(bio) == WRITE) { 345 + bio_integrity_process(bio, &bio->bi_iter, 346 + bi->profile->generate_fn); 347 + } 348 + return true; 338 349 339 - return 0; 350 + err_end_io: 351 + bio->bi_status = status; 352 + bio_endio(bio); 353 + return false; 354 + 340 355 } 341 356 
EXPORT_SYMBOL(bio_integrity_prep); 342 357 ··· 355 368 container_of(work, struct bio_integrity_payload, bip_work); 356 369 struct bio *bio = bip->bip_bio; 357 370 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 371 + struct bvec_iter iter = bio->bi_iter; 358 372 359 - bio->bi_status = bio_integrity_process(bio, bi->profile->verify_fn); 373 + /* 374 + * At the moment verify is called bio's iterator was advanced 375 + * during split and completion, we need to rewind iterator to 376 + * it's original position. 377 + */ 378 + if (bio_rewind_iter(bio, &iter, iter.bi_done)) { 379 + bio->bi_status = bio_integrity_process(bio, &iter, 380 + bi->profile->verify_fn); 381 + } else { 382 + bio->bi_status = BLK_STS_IOERR; 383 + } 360 384 361 - /* Restore original bio completion handler */ 362 - bio->bi_end_io = bip->bip_end_io; 385 + bio_integrity_free(bio); 363 386 bio_endio(bio); 364 387 } 365 388 366 389 /** 367 - * bio_integrity_endio - Integrity I/O completion function 390 + * __bio_integrity_endio - Integrity I/O completion function 368 391 * @bio: Protected bio 369 392 * @error: Pointer to errno 370 393 * ··· 385 388 * in process context. This function postpones completion 386 389 * accordingly. 387 390 */ 388 - void bio_integrity_endio(struct bio *bio) 391 + bool __bio_integrity_endio(struct bio *bio) 389 392 { 390 - struct bio_integrity_payload *bip = bio_integrity(bio); 393 + if (bio_op(bio) == REQ_OP_READ && !bio->bi_status) { 394 + struct bio_integrity_payload *bip = bio_integrity(bio); 391 395 392 - BUG_ON(bip->bip_bio != bio); 393 - 394 - /* In case of an I/O error there is no point in verifying the 395 - * integrity metadata. Restore original bio end_io handler 396 - * and run it. 
397 - */ 398 - if (bio->bi_status) { 399 - bio->bi_end_io = bip->bip_end_io; 400 - bio_endio(bio); 401 - 402 - return; 396 + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); 397 + queue_work(kintegrityd_wq, &bip->bip_work); 398 + return false; 403 399 } 404 400 405 - INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); 406 - queue_work(kintegrityd_wq, &bip->bip_work); 401 + bio_integrity_free(bio); 402 + return true; 407 403 } 408 - EXPORT_SYMBOL(bio_integrity_endio); 409 404 410 405 /** 411 406 * bio_integrity_advance - Advance integrity vector ··· 414 425 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 415 426 unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); 416 427 428 + bip->bip_iter.bi_sector += bytes_done >> 9; 417 429 bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); 418 430 } 419 431 EXPORT_SYMBOL(bio_integrity_advance); ··· 422 432 /** 423 433 * bio_integrity_trim - Trim integrity vector 424 434 * @bio: bio whose integrity vector to update 425 - * @offset: offset to first data sector 426 - * @sectors: number of data sectors 427 435 * 428 436 * Description: Used to trim the integrity vector in a cloned bio. 429 - * The ivec will be advanced corresponding to 'offset' data sectors 430 - * and the length will be truncated corresponding to 'len' data 431 - * sectors. 432 437 */ 433 - void bio_integrity_trim(struct bio *bio, unsigned int offset, 434 - unsigned int sectors) 438 + void bio_integrity_trim(struct bio *bio) 435 439 { 436 440 struct bio_integrity_payload *bip = bio_integrity(bio); 437 441 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 438 442 439 - bio_integrity_advance(bio, offset << 9); 440 - bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors); 443 + bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); 441 444 } 442 445 EXPORT_SYMBOL(bio_integrity_trim); 443 446

+9 -4

block/bio.c

··· 243 243 void bio_uninit(struct bio *bio) 244 244 { 245 245 bio_disassociate_task(bio); 246 - 247 - if (bio_integrity(bio)) 248 - bio_integrity_free(bio); 249 246 } 250 247 EXPORT_SYMBOL(bio_uninit); 251 248 ··· 1810 1813 again: 1811 1814 if (!bio_remaining_done(bio)) 1812 1815 return; 1816 + if (!bio_integrity_endio(bio)) 1817 + return; 1813 1818 1814 1819 /* 1815 1820 * Need to have a real endio function for chained bios, otherwise ··· 1833 1834 } 1834 1835 1835 1836 blk_throtl_bio_endio(bio); 1837 + /* release cgroup info */ 1838 + bio_uninit(bio); 1836 1839 if (bio->bi_end_io) 1837 1840 bio->bi_end_io(bio); 1838 1841 } ··· 1869 1868 split->bi_iter.bi_size = sectors << 9; 1870 1869 1871 1870 if (bio_integrity(split)) 1872 - bio_integrity_trim(split, 0, sectors); 1871 + bio_integrity_trim(split); 1873 1872 1874 1873 bio_advance(bio, split->bi_iter.bi_size); 1875 1874 ··· 1901 1900 bio_advance(bio, offset << 9); 1902 1901 1903 1902 bio->bi_iter.bi_size = size; 1903 + 1904 + if (bio_integrity(bio)) 1905 + bio_integrity_trim(bio); 1906 + 1904 1907 } 1905 1908 EXPORT_SYMBOL_GPL(bio_trim); 1906 1909

+1 -4

block/blk-core.c

··· 1787 1787 1788 1788 blk_queue_split(q, &bio); 1789 1789 1790 - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1791 - bio->bi_status = BLK_STS_IOERR; 1792 - bio_endio(bio); 1790 + if (!bio_integrity_prep(bio)) 1793 1791 return BLK_QC_T_NONE; 1794 - } 1795 1792 1796 1793 if (op_is_flush(bio->bi_opf)) { 1797 1794 spin_lock_irq(q->queue_lock);

+18 -5

block/blk-lib.c

··· 261 261 return 0; 262 262 } 263 263 264 + /* 265 + * Convert a number of 512B sectors to a number of pages. 266 + * The result is limited to a number of pages that can fit into a BIO. 267 + * Also make sure that the result is always at least 1 (page) for the cases 268 + * where nr_sects is lower than the number of sectors in a page. 269 + */ 270 + static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) 271 + { 272 + sector_t bytes = (nr_sects << 9) + PAGE_SIZE - 1; 273 + 274 + return min(bytes >> PAGE_SHIFT, (sector_t)BIO_MAX_PAGES); 275 + } 276 + 264 277 /** 265 278 * __blkdev_issue_zeroout - generate number of zero filed write bios 266 279 * @bdev: blockdev to issue ··· 320 307 321 308 ret = 0; 322 309 while (nr_sects != 0) { 323 - bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES), 324 - gfp_mask); 310 + bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), 311 + gfp_mask); 325 312 bio->bi_iter.bi_sector = sector; 326 313 bio->bi_bdev = bdev; 327 314 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 328 315 329 316 while (nr_sects != 0) { 330 - sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); 331 - bi_size = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); 317 + sz = min((sector_t) PAGE_SIZE, nr_sects << 9); 318 + bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); 332 319 nr_sects -= bi_size >> 9; 333 320 sector += bi_size >> 9; 334 - if (bi_size < (sz << 9)) 321 + if (bi_size < sz) 335 322 break; 336 323 } 337 324 cond_resched();

+5 -3

block/blk-mq-sched.c

··· 515 515 } 516 516 517 517 /* 518 - * Default to 256, since we don't split into sync/async like the 519 - * old code did. Additionally, this is a per-hw queue depth. 518 + * Default to double of smaller one between hw queue_depth and 128, 519 + * since we don't split into sync/async like the old code did. 520 + * Additionally, this is a per-hw queue depth. 520 521 */ 521 - q->nr_requests = 2 * BLKDEV_MAX_RQ; 522 + q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, 523 + BLKDEV_MAX_RQ); 522 524 523 525 queue_for_each_hw_ctx(q, hctx, i) { 524 526 ret = blk_mq_sched_alloc_tags(q, hctx, i);

+1 -3

block/blk-mq.c

··· 1547 1547 1548 1548 blk_queue_split(q, &bio); 1549 1549 1550 - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1551 - bio_io_error(bio); 1550 + if (!bio_integrity_prep(bio)) 1552 1551 return BLK_QC_T_NONE; 1553 - } 1554 1552 1555 1553 if (!is_flush_fua && !blk_queue_nomerges(q) && 1556 1554 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))

+11

block/blk.h

··· 81 81 82 82 #ifdef CONFIG_BLK_DEV_INTEGRITY 83 83 void blk_flush_integrity(void); 84 + bool __bio_integrity_endio(struct bio *); 85 + static inline bool bio_integrity_endio(struct bio *bio) 86 + { 87 + if (bio_integrity(bio)) 88 + return __bio_integrity_endio(bio); 89 + return true; 90 + } 84 91 #else 85 92 static inline void blk_flush_integrity(void) 86 93 { 94 + } 95 + static inline bool bio_integrity_endio(struct bio *bio) 96 + { 97 + return true; 87 98 } 88 99 #endif 89 100

+3 -6

block/t10-pi.c

··· 28 28 29 29 typedef __be16 (csum_fn) (void *, unsigned int); 30 30 31 - static const __be16 APP_ESCAPE = (__force __be16) 0xffff; 32 - static const __be32 REF_ESCAPE = (__force __be32) 0xffffffff; 33 - 34 31 static __be16 t10_pi_crc_fn(void *data, unsigned int len) 35 32 { 36 33 return cpu_to_be16(crc_t10dif(data, len)); ··· 79 82 switch (type) { 80 83 case 1: 81 84 case 2: 82 - if (pi->app_tag == APP_ESCAPE) 85 + if (pi->app_tag == T10_PI_APP_ESCAPE) 83 86 goto next; 84 87 85 88 if (be32_to_cpu(pi->ref_tag) != ··· 92 95 } 93 96 break; 94 97 case 3: 95 - if (pi->app_tag == APP_ESCAPE && 96 - pi->ref_tag == REF_ESCAPE) 98 + if (pi->app_tag == T10_PI_APP_ESCAPE && 99 + pi->ref_tag == T10_PI_REF_ESCAPE) 97 100 goto next; 98 101 break; 99 102 }

+8

drivers/block/cciss.c

··· 1944 1944 return; 1945 1945 } 1946 1946 1947 + static void cciss_initialize_rq(struct request *rq) 1948 + { 1949 + struct scsi_request *sreq = blk_mq_rq_to_pdu(rq); 1950 + 1951 + scsi_req_init(sreq); 1952 + } 1953 + 1947 1954 /* 1948 1955 * cciss_add_disk sets up the block device queue for a logical drive 1949 1956 */ ··· 1963 1956 1964 1957 disk->queue->cmd_size = sizeof(struct scsi_request); 1965 1958 disk->queue->request_fn = do_cciss_request; 1959 + disk->queue->initialize_rq_fn = cciss_initialize_rq; 1966 1960 disk->queue->queue_lock = &h->lock; 1967 1961 queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, disk->queue); 1968 1962 if (blk_init_allocated_queue(disk->queue) < 0)

+17 -6

drivers/block/mtip32xx/mtip32xx.c

··· 174 174 { 175 175 struct driver_data *dd = rq->q->queuedata; 176 176 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); 177 - u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; 178 177 179 178 /* Point the command headers at the command tables. */ 180 179 cmd->command_header = dd->port->command_list + ··· 181 182 cmd->command_header_dma = dd->port->command_list_dma + 182 183 (sizeof(struct mtip_cmd_hdr) * rq->tag); 183 184 184 - if (host_cap_64) 185 + if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags)) 185 186 cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16); 186 187 187 188 cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); ··· 385 386 port->mmio + PORT_LST_ADDR_HI); 386 387 writel((port->rxfis_dma >> 16) >> 16, 387 388 port->mmio + PORT_FIS_ADDR_HI); 389 + set_bit(MTIP_PF_HOST_CAP_64, &port->flags); 388 390 } 389 391 390 392 writel(port->command_list_dma & 0xFFFFFFFF, ··· 950 950 unsigned long to; 951 951 bool active = true; 952 952 953 - blk_mq_stop_hw_queues(port->dd->queue); 953 + blk_mq_quiesce_queue(port->dd->queue); 954 954 955 955 to = jiffies + msecs_to_jiffies(timeout); 956 956 do { ··· 970 970 break; 971 971 } while (time_before(jiffies, to)); 972 972 973 - blk_mq_start_stopped_hw_queues(port->dd->queue, true); 973 + blk_mq_unquiesce_queue(port->dd->queue); 974 974 return active ? 
-EBUSY : 0; 975 975 err_fault: 976 - blk_mq_start_stopped_hw_queues(port->dd->queue, true); 976 + blk_mq_unquiesce_queue(port->dd->queue); 977 977 return -EFAULT; 978 978 } 979 979 ··· 2737 2737 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); 2738 2738 struct driver_data *dd = data; 2739 2739 2740 + if (!blk_mq_request_started(req)) 2741 + return; 2742 + 2740 2743 dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag); 2741 2744 2742 2745 clear_bit(req->tag, dd->port->cmds_to_issue); ··· 2751 2748 bool reserved) 2752 2749 { 2753 2750 struct driver_data *dd = data; 2751 + 2752 + if (!blk_mq_request_started(req)) 2753 + return; 2754 2754 2755 2755 set_bit(req->tag, dd->port->cmds_to_issue); 2756 2756 blk_abort_request(req); ··· 2820 2814 dev_warn(&dd->pdev->dev, 2821 2815 "Completion workers still active!"); 2822 2816 2817 + blk_mq_quiesce_queue(dd->queue); 2818 + 2823 2819 spin_lock(dd->queue->queue_lock); 2824 2820 blk_mq_tagset_busy_iter(&dd->tags, 2825 2821 mtip_queue_cmd, dd); ··· 2834 2826 mtip_abort_cmd, dd); 2835 2827 2836 2828 clear_bit(MTIP_PF_TO_ACTIVE_BIT, &dd->port->flags); 2829 + 2830 + blk_mq_unquiesce_queue(dd->queue); 2837 2831 } 2838 2832 2839 2833 if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { ··· 4005 3995 dd->disk->disk_name); 4006 3996 4007 3997 blk_freeze_queue_start(dd->queue); 4008 - blk_mq_stop_hw_queues(dd->queue); 3998 + blk_mq_quiesce_queue(dd->queue); 4009 3999 blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd); 4000 + blk_mq_unquiesce_queue(dd->queue); 4010 4001 4011 4002 /* 4012 4003 * Delete our gendisk structure. This also removes the device

+1

drivers/block/mtip32xx/mtip32xx.h

··· 140 140 (1 << MTIP_PF_SE_ACTIVE_BIT) | 141 141 (1 << MTIP_PF_DM_ACTIVE_BIT) | 142 142 (1 << MTIP_PF_TO_ACTIVE_BIT)), 143 + MTIP_PF_HOST_CAP_64 = 10, /* cache HOST_CAP_64 */ 143 144 144 145 MTIP_PF_SVC_THD_ACTIVE_BIT = 4, 145 146 MTIP_PF_ISSUE_CMDS_BIT = 5,

+2 -2

drivers/block/nbd.c

··· 661 661 662 662 static void nbd_clear_que(struct nbd_device *nbd) 663 663 { 664 - blk_mq_stop_hw_queues(nbd->disk->queue); 664 + blk_mq_quiesce_queue(nbd->disk->queue); 665 665 blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); 666 - blk_mq_start_hw_queues(nbd->disk->queue); 666 + blk_mq_unquiesce_queue(nbd->disk->queue); 667 667 dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); 668 668 } 669 669

+13 -5

drivers/block/null_blk.c

··· 844 844 queue_mode = NULL_Q_MQ; 845 845 } 846 846 847 - if (queue_mode == NULL_Q_MQ && shared_tags) 848 - null_init_tag_set(&tag_set); 849 - 850 847 if (queue_mode == NULL_Q_MQ && use_per_node_hctx) { 851 848 if (submit_queues < nr_online_nodes) { 852 849 pr_warn("null_blk: submit_queues param is set to %u.", ··· 855 858 else if (!submit_queues) 856 859 submit_queues = 1; 857 860 861 + if (queue_mode == NULL_Q_MQ && shared_tags) { 862 + ret = null_init_tag_set(&tag_set); 863 + if (ret) 864 + return ret; 865 + } 866 + 858 867 mutex_init(&lock); 859 868 860 869 null_major = register_blkdev(0, "nullb"); 861 - if (null_major < 0) 862 - return null_major; 870 + if (null_major < 0) { 871 + ret = null_major; 872 + goto err_tagset; 873 + } 863 874 864 875 if (use_lightnvm) { 865 876 ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64), ··· 896 891 kmem_cache_destroy(ppa_cache); 897 892 err_ppa: 898 893 unregister_blkdev(null_major, "nullb"); 894 + err_tagset: 895 + if (queue_mode == NULL_Q_MQ && shared_tags) 896 + blk_mq_free_tag_set(&tag_set); 899 897 return ret; 900 898 } 901 899

+2 -2

drivers/block/virtio_blk.c

··· 840 840 /* Make sure no work handler is accessing the device. */ 841 841 flush_work(&vblk->config_work); 842 842 843 - blk_mq_stop_hw_queues(vblk->disk->queue); 843 + blk_mq_quiesce_queue(vblk->disk->queue); 844 844 845 845 vdev->config->del_vqs(vdev); 846 846 return 0; ··· 857 857 858 858 virtio_device_ready(vdev); 859 859 860 - blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); 860 + blk_mq_unquiesce_queue(vblk->disk->queue); 861 861 return 0; 862 862 } 863 863 #endif

+48 -13

drivers/lightnvm/pblk-core.c

··· 1670 1670 queue_work(wq, &line_ws->ws); 1671 1671 } 1672 1672 1673 - void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, 1674 - unsigned long *lun_bitmap) 1673 + static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, 1674 + int nr_ppas, int pos) 1675 1675 { 1676 - struct nvm_tgt_dev *dev = pblk->dev; 1677 - struct nvm_geo *geo = &dev->geo; 1678 - struct pblk_lun *rlun; 1679 - int pos = pblk_ppa_to_pos(geo, ppa_list[0]); 1676 + struct pblk_lun *rlun = &pblk->luns[pos]; 1680 1677 int ret; 1681 1678 1682 1679 /* ··· 1687 1690 WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun || 1688 1691 ppa_list[0].g.ch != ppa_list[i].g.ch); 1689 1692 #endif 1690 - /* If the LUN has been locked for this same request, do no attempt to 1691 - * lock it again 1692 - */ 1693 - if (test_and_set_bit(pos, lun_bitmap)) 1694 - return; 1695 1693 1696 - rlun = &pblk->luns[pos]; 1697 - ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000)); 1694 + ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); 1698 1695 if (ret) { 1699 1696 switch (ret) { 1700 1697 case -ETIME: ··· 1699 1708 break; 1700 1709 } 1701 1710 } 1711 + } 1712 + 1713 + void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) 1714 + { 1715 + struct nvm_tgt_dev *dev = pblk->dev; 1716 + struct nvm_geo *geo = &dev->geo; 1717 + int pos = pblk_ppa_to_pos(geo, ppa_list[0]); 1718 + 1719 + __pblk_down_page(pblk, ppa_list, nr_ppas, pos); 1720 + } 1721 + 1722 + void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, 1723 + unsigned long *lun_bitmap) 1724 + { 1725 + struct nvm_tgt_dev *dev = pblk->dev; 1726 + struct nvm_geo *geo = &dev->geo; 1727 + int pos = pblk_ppa_to_pos(geo, ppa_list[0]); 1728 + 1729 + /* If the LUN has been locked for this same request, do no attempt to 1730 + * lock it again 1731 + */ 1732 + if (test_and_set_bit(pos, lun_bitmap)) 1733 + return; 1734 + 1735 + __pblk_down_page(pblk, ppa_list, nr_ppas, pos); 1736 + } 1737 + 
1738 + void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) 1739 + { 1740 + struct nvm_tgt_dev *dev = pblk->dev; 1741 + struct nvm_geo *geo = &dev->geo; 1742 + struct pblk_lun *rlun; 1743 + int pos = pblk_ppa_to_pos(geo, ppa_list[0]); 1744 + 1745 + #ifdef CONFIG_NVM_DEBUG 1746 + int i; 1747 + 1748 + for (i = 1; i < nr_ppas; i++) 1749 + WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun || 1750 + ppa_list[0].g.ch != ppa_list[i].g.ch); 1751 + #endif 1752 + 1753 + rlun = &pblk->luns[pos]; 1754 + up(&rlun->wr_sem); 1702 1755 } 1703 1756 1704 1757 void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,

+21 -10

drivers/lightnvm/pblk-recovery.c

··· 340 340 struct pblk *pblk = pad_rq->pblk; 341 341 struct nvm_tgt_dev *dev = pblk->dev; 342 342 343 - kref_put(&pad_rq->ref, pblk_recov_complete); 343 + pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); 344 + 345 + bio_put(rqd->bio); 344 346 nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); 345 347 pblk_free_rqd(pblk, rqd, WRITE); 348 + 349 + atomic_dec(&pblk->inflight_io); 350 + kref_put(&pad_rq->ref, pblk_recov_complete); 346 351 } 347 352 348 353 static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line, ··· 390 385 rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); 391 386 if (rq_ppas < pblk->min_write_pgs) { 392 387 pr_err("pblk: corrupted pad line %d\n", line->id); 393 - goto free_rq; 388 + goto fail_free_pad; 394 389 } 395 390 396 391 rq_len = rq_ppas * geo->sec_size; ··· 398 393 meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); 399 394 if (!meta_list) { 400 395 ret = -ENOMEM; 401 - goto free_data; 396 + goto fail_free_pad; 402 397 } 403 398 404 399 ppa_list = (void *)(meta_list) + pblk_dma_meta_size; ··· 409 404 ret = PTR_ERR(rqd); 410 405 goto fail_free_meta; 411 406 } 412 - memset(rqd, 0, pblk_w_rq_size); 413 407 414 - bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); 408 + bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, 409 + PBLK_VMALLOC_META, GFP_KERNEL); 415 410 if (IS_ERR(bio)) { 416 411 ret = PTR_ERR(bio); 417 412 goto fail_free_rqd; ··· 458 453 } 459 454 460 455 kref_get(&pad_rq->ref); 456 + pblk_down_page(pblk, rqd->ppa_list, rqd->nr_ppas); 461 457 462 458 ret = pblk_submit_io(pblk, rqd); 463 459 if (ret) { 464 460 pr_err("pblk: I/O submission failed: %d\n", ret); 465 - goto free_data; 461 + pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); 462 + goto fail_free_bio; 466 463 } 467 - 468 - atomic_dec(&pblk->inflight_io); 469 464 470 465 left_line_ppas -= rq_ppas; 471 466 left_ppas -= rq_ppas; ··· 480 475 ret = -ETIME; 481 476 } 482 477 478 + if (!pblk_line_is_full(line)) 479 + pr_err("pblk: 
corrupted padded line: %d\n", line->id); 480 + 481 + vfree(data); 483 482 free_rq: 484 483 kfree(pad_rq); 485 - free_data: 486 - vfree(data); 487 484 return ret; 488 485 486 + fail_free_bio: 487 + bio_put(bio); 489 488 fail_free_rqd: 490 489 pblk_free_rqd(pblk, rqd, WRITE); 491 490 fail_free_meta: 492 491 nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); 492 + fail_free_pad: 493 493 kfree(pad_rq); 494 + vfree(data); 494 495 return ret; 495 496 } 496 497

+8 -18

drivers/lightnvm/pblk-write.c

··· 39 39 40 40 ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); 41 41 42 - if (rqd->meta_list) 43 - nvm_dev_dma_free(dev->parent, rqd->meta_list, 44 - rqd->dma_meta_list); 42 + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); 45 43 46 44 bio_put(rqd->bio); 47 45 pblk_free_rqd(pblk, rqd, WRITE); ··· 176 178 { 177 179 struct pblk *pblk = rqd->private; 178 180 struct nvm_tgt_dev *dev = pblk->dev; 179 - struct nvm_geo *geo = &dev->geo; 180 181 struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); 181 182 struct pblk_line *line = m_ctx->private; 182 183 struct pblk_emeta *emeta = line->emeta; 183 - int pos = pblk_ppa_to_pos(geo, rqd->ppa_list[0]); 184 - struct pblk_lun *rlun = &pblk->luns[pos]; 185 184 int sync; 186 185 187 - up(&rlun->wr_sem); 186 + pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); 188 187 189 188 if (rqd->error) { 190 189 pblk_log_write_err(pblk, rqd); ··· 198 203 pblk->close_wq); 199 204 200 205 bio_put(rqd->bio); 206 + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); 201 207 pblk_free_rqd(pblk, rqd, READ); 202 208 203 209 atomic_dec(&pblk->inflight_io); ··· 221 225 &rqd->dma_meta_list); 222 226 if (!rqd->meta_list) 223 227 return -ENOMEM; 224 - 225 - if (unlikely(nr_secs == 1)) 226 - return 0; 227 228 228 229 rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; 229 230 rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; ··· 360 367 struct pblk_line_meta *lm = &pblk->lm; 361 368 struct pblk_emeta *emeta = meta_line->emeta; 362 369 struct pblk_g_ctx *m_ctx; 363 - struct pblk_lun *rlun; 364 370 struct bio *bio; 365 371 struct nvm_rq *rqd; 366 372 void *data; ··· 403 411 rqd->ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id); 404 412 } 405 413 406 - rlun = &pblk->luns[pblk_ppa_to_pos(geo, rqd->ppa_list[0])]; 407 - ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000)); 408 - if (ret) { 409 - pr_err("pblk: lun semaphore timed out (%d)\n", ret); 410 - goto fail_free_bio; 411 - } 412 - 413 414 emeta->mem += 
rq_len; 414 415 if (emeta->mem >= lm->emeta_len[0]) { 415 416 spin_lock(&l_mg->close_lock); ··· 411 426 "pblk: corrupt meta line %d\n", meta_line->id); 412 427 spin_unlock(&l_mg->close_lock); 413 428 } 429 + 430 + pblk_down_page(pblk, rqd->ppa_list, rqd->nr_ppas); 414 431 415 432 ret = pblk_submit_io(pblk, rqd); 416 433 if (ret) { ··· 423 436 return NVM_IO_OK; 424 437 425 438 fail_rollback: 439 + pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); 426 440 spin_lock(&l_mg->close_lock); 427 441 pblk_dealloc_page(pblk, meta_line, rq_ppas); 428 442 list_add(&meta_line->list, &meta_line->list); 429 443 spin_unlock(&l_mg->close_lock); 444 + 445 + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); 430 446 fail_free_bio: 431 447 if (likely(l_mg->emeta_alloc_type == PBLK_VMALLOC_META)) 432 448 bio_put(bio);

+2

drivers/lightnvm/pblk.h

··· 739 739 u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); 740 740 int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, 741 741 unsigned long secs_to_flush); 742 + void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas); 742 743 void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, 743 744 unsigned long *lun_bitmap); 745 + void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas); 744 746 void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, 745 747 unsigned long *lun_bitmap); 746 748 void pblk_end_bio_sync(struct bio *bio);

+1 -1

drivers/md/dm.c

··· 1279 1279 clone->bi_iter.bi_size = to_bytes(len); 1280 1280 1281 1281 if (unlikely(bio_integrity(bio) != NULL)) 1282 - bio_integrity_trim(clone, 0, len); 1282 + bio_integrity_trim(clone); 1283 1283 1284 1284 return 0; 1285 1285 }

+4 -12

drivers/nvdimm/blk.c

··· 106 106 107 107 len -= cur_len; 108 108 dev_offset += cur_len; 109 - bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len); 109 + if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len)) 110 + return -EIO; 110 111 } 111 112 112 113 return err; ··· 180 179 int err = 0, rw; 181 180 bool do_acct; 182 181 183 - /* 184 - * bio_integrity_enabled also checks if the bio already has an 185 - * integrity payload attached. If it does, we *don't* do a 186 - * bio_integrity_prep here - the payload has been generated by 187 - * another kernel subsystem, and we just pass it through. 188 - */ 189 - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 190 - bio->bi_status = BLK_STS_IOERR; 191 - goto out; 192 - } 182 + if (!bio_integrity_prep(bio)) 183 + return BLK_QC_T_NONE; 193 184 194 185 bip = bio_integrity(bio); 195 186 nsblk = q->queuedata; ··· 205 212 if (do_acct) 206 213 nd_iostat_end(bio, start); 207 214 208 - out: 209 215 bio_endio(bio); 210 216 return BLK_QC_T_NONE; 211 217 }

+4 -12

drivers/nvdimm/btt.c

··· 985 985 986 986 len -= cur_len; 987 987 meta_nsoff += cur_len; 988 - bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len); 988 + if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len)) 989 + return -EIO; 989 990 } 990 991 991 992 return ret; ··· 1204 1203 int err = 0; 1205 1204 bool do_acct; 1206 1205 1207 - /* 1208 - * bio_integrity_enabled also checks if the bio already has an 1209 - * integrity payload attached. If it does, we *don't* do a 1210 - * bio_integrity_prep here - the payload has been generated by 1211 - * another kernel subsystem, and we just pass it through. 1212 - */ 1213 - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1214 - bio->bi_status = BLK_STS_IOERR; 1215 - goto out; 1216 - } 1206 + if (!bio_integrity_prep(bio)) 1207 + return BLK_QC_T_NONE; 1217 1208 1218 1209 do_acct = nd_iostat_start(bio, &start); 1219 1210 bio_for_each_segment(bvec, bio, iter) { ··· 1232 1239 if (do_acct) 1233 1240 nd_iostat_end(bio, start); 1234 1241 1235 - out: 1236 1242 bio_endio(bio); 1237 1243 return BLK_QC_T_NONE; 1238 1244 }

+21 -19

drivers/nvme/host/core.c

··· 131 131 { 132 132 if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { 133 133 nvme_req(req)->retries++; 134 - blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q)); 134 + blk_mq_requeue_request(req, true); 135 135 return; 136 136 } 137 137 ··· 2591 2591 spin_unlock(&dev_list_lock); 2592 2592 } 2593 2593 2594 - void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) 2594 + void nvme_stop_ctrl(struct nvme_ctrl *ctrl) 2595 2595 { 2596 + nvme_stop_keep_alive(ctrl); 2596 2597 flush_work(&ctrl->async_event_work); 2597 2598 flush_work(&ctrl->scan_work); 2598 - nvme_remove_namespaces(ctrl); 2599 + } 2600 + EXPORT_SYMBOL_GPL(nvme_stop_ctrl); 2599 2601 2602 + void nvme_start_ctrl(struct nvme_ctrl *ctrl) 2603 + { 2604 + if (ctrl->kato) 2605 + nvme_start_keep_alive(ctrl); 2606 + 2607 + if (ctrl->queue_count > 1) { 2608 + nvme_queue_scan(ctrl); 2609 + nvme_queue_async_events(ctrl); 2610 + nvme_start_queues(ctrl); 2611 + } 2612 + } 2613 + EXPORT_SYMBOL_GPL(nvme_start_ctrl); 2614 + 2615 + void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) 2616 + { 2600 2617 device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); 2601 2618 2602 2619 spin_lock(&dev_list_lock); ··· 2711 2694 /* Forcibly unquiesce queues to avoid blocking dispatch */ 2712 2695 blk_mq_unquiesce_queue(ctrl->admin_q); 2713 2696 2714 - /* Forcibly start all queues to avoid having stuck requests */ 2715 - blk_mq_start_hw_queues(ctrl->admin_q); 2716 - 2717 2697 list_for_each_entry(ns, &ctrl->namespaces, list) { 2718 2698 /* 2719 2699 * Revalidating a dead namespace sets capacity to 0. This will ··· 2723 2709 2724 2710 /* Forcibly unquiesce queues to avoid blocking dispatch */ 2725 2711 blk_mq_unquiesce_queue(ns->queue); 2726 - 2727 - /* 2728 - * Forcibly start all queues to avoid having stuck requests. 2729 - * Note that we must ensure the queues are not stopped 2730 - * when the final removal happens. 
2731 - */ 2732 - blk_mq_start_hw_queues(ns->queue); 2733 - 2734 - /* draining requests in requeue list */ 2735 - blk_mq_kick_requeue_list(ns->queue); 2736 2712 } 2737 2713 mutex_unlock(&ctrl->namespaces_mutex); 2738 2714 } ··· 2791 2787 struct nvme_ns *ns; 2792 2788 2793 2789 mutex_lock(&ctrl->namespaces_mutex); 2794 - list_for_each_entry(ns, &ctrl->namespaces, list) { 2790 + list_for_each_entry(ns, &ctrl->namespaces, list) 2795 2791 blk_mq_unquiesce_queue(ns->queue); 2796 - blk_mq_kick_requeue_list(ns->queue); 2797 - } 2798 2792 mutex_unlock(&ctrl->namespaces_mutex); 2799 2793 } 2800 2794 EXPORT_SYMBOL_GPL(nvme_start_queues);

+40 -43

drivers/nvme/host/fc.c

··· 148 148 struct device *dev; 149 149 struct nvme_fc_lport *lport; 150 150 struct nvme_fc_rport *rport; 151 - u32 queue_count; 152 151 u32 cnum; 153 152 154 153 u64 association_id; 155 - 156 - u64 cap; 157 154 158 155 struct list_head ctrl_list; /* rport->ctrl_list */ 159 156 ··· 1611 1614 { 1612 1615 int i; 1613 1616 1614 - for (i = 1; i < ctrl->queue_count; i++) 1617 + for (i = 1; i < ctrl->ctrl.queue_count; i++) 1615 1618 nvme_fc_free_queue(&ctrl->queues[i]); 1616 1619 } 1617 1620 ··· 1632 1635 static void 1633 1636 nvme_fc_delete_hw_io_queues(struct nvme_fc_ctrl *ctrl) 1634 1637 { 1635 - struct nvme_fc_queue *queue = &ctrl->queues[ctrl->queue_count - 1]; 1638 + struct nvme_fc_queue *queue = &ctrl->queues[ctrl->ctrl.queue_count - 1]; 1636 1639 int i; 1637 1640 1638 - for (i = ctrl->queue_count - 1; i >= 1; i--, queue--) 1641 + for (i = ctrl->ctrl.queue_count - 1; i >= 1; i--, queue--) 1639 1642 __nvme_fc_delete_hw_queue(ctrl, queue, i); 1640 1643 } 1641 1644 ··· 1645 1648 struct nvme_fc_queue *queue = &ctrl->queues[1]; 1646 1649 int i, ret; 1647 1650 1648 - for (i = 1; i < ctrl->queue_count; i++, queue++) { 1651 + for (i = 1; i < ctrl->ctrl.queue_count; i++, queue++) { 1649 1652 ret = __nvme_fc_create_hw_queue(ctrl, queue, i, qsize); 1650 1653 if (ret) 1651 1654 goto delete_queues; ··· 1664 1667 { 1665 1668 int i, ret = 0; 1666 1669 1667 - for (i = 1; i < ctrl->queue_count; i++) { 1670 + for (i = 1; i < ctrl->ctrl.queue_count; i++) { 1668 1671 ret = nvme_fc_connect_queue(ctrl, &ctrl->queues[i], qsize, 1669 1672 (qsize / 5)); 1670 1673 if (ret) ··· 1682 1685 { 1683 1686 int i; 1684 1687 1685 - for (i = 1; i < ctrl->queue_count; i++) 1688 + for (i = 1; i < ctrl->ctrl.queue_count; i++) 1686 1689 nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize); 1687 1690 } 1688 1691 ··· 1703 1706 list_del(&ctrl->ctrl_list); 1704 1707 spin_unlock_irqrestore(&ctrl->rport->lock, flags); 1705 1708 1709 + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 1706 1710 
blk_cleanup_queue(ctrl->ctrl.admin_q); 1707 1711 blk_mq_free_tag_set(&ctrl->admin_tag_set); 1708 1712 ··· 1967 1969 if (ret != -EBUSY) 1968 1970 return BLK_STS_IOERR; 1969 1971 1970 - if (op->rq) { 1971 - blk_mq_stop_hw_queues(op->rq->q); 1972 - blk_mq_delay_queue(queue->hctx, NVMEFC_QUEUE_DELAY); 1973 - } 1972 + if (op->rq) 1973 + blk_mq_delay_run_hw_queue(queue->hctx, NVMEFC_QUEUE_DELAY); 1974 + 1974 1975 return BLK_STS_RESOURCE; 1975 1976 } 1976 1977 ··· 2175 2178 nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) 2176 2179 { 2177 2180 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; 2181 + unsigned int nr_io_queues; 2178 2182 int ret; 2179 2183 2180 - ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); 2184 + nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), 2185 + ctrl->lport->ops->max_hw_queues); 2186 + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); 2181 2187 if (ret) { 2182 2188 dev_info(ctrl->ctrl.device, 2183 2189 "set_queue_count failed: %d\n", ret); 2184 2190 return ret; 2185 2191 } 2186 2192 2187 - ctrl->queue_count = opts->nr_io_queues + 1; 2188 - if (!opts->nr_io_queues) 2193 + ctrl->ctrl.queue_count = nr_io_queues + 1; 2194 + if (!nr_io_queues) 2189 2195 return 0; 2190 2196 2191 2197 nvme_fc_init_io_queues(ctrl); ··· 2204 2204 sizeof(struct scatterlist)) + 2205 2205 ctrl->lport->ops->fcprqst_priv_sz; 2206 2206 ctrl->tag_set.driver_data = ctrl; 2207 - ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; 2207 + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; 2208 2208 ctrl->tag_set.timeout = NVME_IO_TIMEOUT; 2209 2209 2210 2210 ret = blk_mq_alloc_tag_set(&ctrl->tag_set); ··· 2232 2232 out_delete_hw_queues: 2233 2233 nvme_fc_delete_hw_io_queues(ctrl); 2234 2234 out_cleanup_blk_queue: 2235 - nvme_stop_keep_alive(&ctrl->ctrl); 2236 2235 blk_cleanup_queue(ctrl->ctrl.connect_q); 2237 2236 out_free_tag_set: 2238 2237 blk_mq_free_tag_set(&ctrl->tag_set); ··· 2247 2248 nvme_fc_reinit_io_queues(struct nvme_fc_ctrl 
*ctrl) 2248 2249 { 2249 2250 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; 2251 + unsigned int nr_io_queues; 2250 2252 int ret; 2251 2253 2252 - ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); 2254 + nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), 2255 + ctrl->lport->ops->max_hw_queues); 2256 + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); 2253 2257 if (ret) { 2254 2258 dev_info(ctrl->ctrl.device, 2255 2259 "set_queue_count failed: %d\n", ret); 2256 2260 return ret; 2257 2261 } 2258 2262 2263 + ctrl->ctrl.queue_count = nr_io_queues + 1; 2259 2264 /* check for io queues existing */ 2260 - if (ctrl->queue_count == 1) 2265 + if (ctrl->ctrl.queue_count == 1) 2261 2266 return 0; 2262 2267 2263 2268 nvme_fc_init_io_queues(ctrl); ··· 2277 2274 ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.opts->queue_size); 2278 2275 if (ret) 2279 2276 goto out_delete_hw_queues; 2277 + 2278 + blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); 2280 2279 2281 2280 return 0; 2282 2281 ··· 2321 2316 goto out_delete_hw_queue; 2322 2317 2323 2318 if (ctrl->ctrl.state != NVME_CTRL_NEW) 2324 - blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true); 2319 + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 2325 2320 2326 2321 ret = nvmf_connect_admin_queue(&ctrl->ctrl); 2327 2322 if (ret) ··· 2334 2329 * prior connection values 2335 2330 */ 2336 2331 2337 - ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); 2332 + ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); 2338 2333 if (ret) { 2339 2334 dev_err(ctrl->ctrl.device, 2340 2335 "prop_get NVME_REG_CAP failed\n"); ··· 2342 2337 } 2343 2338 2344 2339 ctrl->ctrl.sqsize = 2345 - min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize); 2340 + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap) + 1, ctrl->ctrl.sqsize); 2346 2341 2347 - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); 2342 + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); 2348 2343 if (ret) 2349 2344 goto 
out_disconnect_admin_queue; 2350 2345 ··· 2365 2360 goto out_disconnect_admin_queue; 2366 2361 } 2367 2362 2368 - nvme_start_keep_alive(&ctrl->ctrl); 2369 - 2370 2363 /* FC-NVME supports normal SGL Data Block Descriptors */ 2371 2364 2372 2365 if (opts->queue_size > ctrl->ctrl.maxcmd) { ··· 2384 2381 * Create the io queues 2385 2382 */ 2386 2383 2387 - if (ctrl->queue_count > 1) { 2384 + if (ctrl->ctrl.queue_count > 1) { 2388 2385 if (ctrl->ctrl.state == NVME_CTRL_NEW) 2389 2386 ret = nvme_fc_create_io_queues(ctrl); 2390 2387 else ··· 2398 2395 2399 2396 ctrl->ctrl.nr_reconnects = 0; 2400 2397 2401 - if (ctrl->queue_count > 1) { 2402 - nvme_start_queues(&ctrl->ctrl); 2403 - nvme_queue_scan(&ctrl->ctrl); 2404 - nvme_queue_async_events(&ctrl->ctrl); 2405 - } 2398 + nvme_start_ctrl(&ctrl->ctrl); 2406 2399 2407 2400 return 0; /* Success */ 2408 2401 2409 2402 out_term_aen_ops: 2410 2403 nvme_fc_term_aen_ops(ctrl); 2411 - nvme_stop_keep_alive(&ctrl->ctrl); 2412 2404 out_disconnect_admin_queue: 2413 2405 /* send a Disconnect(association) LS to fc-nvme target */ 2414 2406 nvme_fc_xmt_disconnect_assoc(ctrl); ··· 2426 2428 { 2427 2429 unsigned long flags; 2428 2430 2429 - nvme_stop_keep_alive(&ctrl->ctrl); 2430 - 2431 2431 spin_lock_irqsave(&ctrl->lock, flags); 2432 2432 ctrl->flags |= FCCTRL_TERMIO; 2433 2433 ctrl->iocnt = 0; ··· 2443 2447 * io requests back to the block layer as part of normal completions 2444 2448 * (but with error status). 2445 2449 */ 2446 - if (ctrl->queue_count > 1) { 2450 + if (ctrl->ctrl.queue_count > 1) { 2447 2451 nvme_stop_queues(&ctrl->ctrl); 2448 2452 blk_mq_tagset_busy_iter(&ctrl->tag_set, 2449 2453 nvme_fc_terminate_exchange, &ctrl->ctrl); ··· 2466 2470 * use blk_mq_tagset_busy_itr() and the transport routine to 2467 2471 * terminate the exchanges. 
2468 2472 */ 2469 - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); 2473 + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 2470 2474 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, 2471 2475 nvme_fc_terminate_exchange, &ctrl->ctrl); 2472 2476 ··· 2507 2511 2508 2512 cancel_work_sync(&ctrl->ctrl.reset_work); 2509 2513 cancel_delayed_work_sync(&ctrl->connect_work); 2510 - 2514 + nvme_stop_ctrl(&ctrl->ctrl); 2515 + nvme_remove_namespaces(&ctrl->ctrl); 2511 2516 /* 2512 2517 * kill the association on the link side. this will block 2513 2518 * waiting for io to terminate ··· 2603 2606 container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); 2604 2607 int ret; 2605 2608 2609 + nvme_stop_ctrl(&ctrl->ctrl); 2606 2610 /* will block will waiting for io to terminate */ 2607 2611 nvme_fc_delete_association(ctrl); 2608 2612 ··· 2700 2702 spin_lock_init(&ctrl->lock); 2701 2703 2702 2704 /* io queue count */ 2703 - ctrl->queue_count = min_t(unsigned int, 2705 + ctrl->ctrl.queue_count = min_t(unsigned int, 2704 2706 opts->nr_io_queues, 2705 2707 lport->ops->max_hw_queues); 2706 - opts->nr_io_queues = ctrl->queue_count; /* so opts has valid value */ 2707 - ctrl->queue_count++; /* +1 for admin queue */ 2708 + ctrl->ctrl.queue_count++; /* +1 for admin queue */ 2708 2709 2709 2710 ctrl->ctrl.sqsize = opts->queue_size - 1; 2710 2711 ctrl->ctrl.kato = opts->kato; 2711 2712 2712 2713 ret = -ENOMEM; 2713 - ctrl->queues = kcalloc(ctrl->queue_count, sizeof(struct nvme_fc_queue), 2714 - GFP_KERNEL); 2714 + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, 2715 + sizeof(struct nvme_fc_queue), GFP_KERNEL); 2715 2716 if (!ctrl->queues) 2716 2717 goto out_free_ida; 2717 2718

+4

drivers/nvme/host/nvme.h

··· 142 142 u16 cntlid; 143 143 144 144 u32 ctrl_config; 145 + u32 queue_count; 145 146 147 + u64 cap; 146 148 u32 page_size; 147 149 u32 max_hw_sectors; 148 150 u16 oncs; ··· 280 278 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, 281 279 const struct nvme_ctrl_ops *ops, unsigned long quirks); 282 280 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); 281 + void nvme_start_ctrl(struct nvme_ctrl *ctrl); 282 + void nvme_stop_ctrl(struct nvme_ctrl *ctrl); 283 283 void nvme_put_ctrl(struct nvme_ctrl *ctrl); 284 284 int nvme_init_identify(struct nvme_ctrl *ctrl); 285 285

+58 -38

drivers/nvme/host/pci.c

··· 35 35 36 36 #include "nvme.h" 37 37 38 - #define NVME_Q_DEPTH 1024 39 38 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) 40 39 #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) 41 40 ··· 56 57 MODULE_PARM_DESC(max_host_mem_size_mb, 57 58 "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); 58 59 60 + static int io_queue_depth_set(const char *val, const struct kernel_param *kp); 61 + static const struct kernel_param_ops io_queue_depth_ops = { 62 + .set = io_queue_depth_set, 63 + .get = param_get_int, 64 + }; 65 + 66 + static int io_queue_depth = 1024; 67 + module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); 68 + MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); 69 + 59 70 struct nvme_dev; 60 71 struct nvme_queue; 61 72 ··· 83 74 struct device *dev; 84 75 struct dma_pool *prp_page_pool; 85 76 struct dma_pool *prp_small_pool; 86 - unsigned queue_count; 87 77 unsigned online_queues; 88 78 unsigned max_qid; 89 79 int q_depth; ··· 112 104 struct nvme_host_mem_buf_desc *host_mem_descs; 113 105 void **host_mem_desc_bufs; 114 106 }; 107 + 108 + static int io_queue_depth_set(const char *val, const struct kernel_param *kp) 109 + { 110 + int n = 0, ret; 111 + 112 + ret = kstrtoint(val, 10, &n); 113 + if (ret != 0 || n < 2) 114 + return -EINVAL; 115 + 116 + return param_set_int(val, kp); 117 + } 115 118 116 119 static inline unsigned int sq_idx(unsigned int qid, u32 stride) 117 120 { ··· 1118 1099 { 1119 1100 int i; 1120 1101 1121 - for (i = dev->queue_count - 1; i >= lowest; i--) { 1102 + for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { 1122 1103 struct nvme_queue *nvmeq = dev->queues[i]; 1123 - dev->queue_count--; 1104 + dev->ctrl.queue_count--; 1124 1105 dev->queues[i] = NULL; 1125 1106 nvme_free_queue(nvmeq); 1126 1107 } ··· 1145 1126 spin_unlock_irq(&nvmeq->q_lock); 1146 1127 1147 1128 if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) 1148 - 
blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q); 1129 + blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); 1149 1130 1150 1131 pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); 1151 1132 ··· 1164 1145 if (shutdown) 1165 1146 nvme_shutdown_ctrl(&dev->ctrl); 1166 1147 else 1167 - nvme_disable_ctrl(&dev->ctrl, lo_hi_readq( 1168 - dev->bar + NVME_REG_CAP)); 1148 + nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); 1169 1149 1170 1150 spin_lock_irq(&nvmeq->q_lock); 1171 1151 nvme_process_cq(nvmeq); ··· 1239 1221 nvmeq->qid = qid; 1240 1222 nvmeq->cq_vector = -1; 1241 1223 dev->queues[qid] = nvmeq; 1242 - dev->queue_count++; 1224 + dev->ctrl.queue_count++; 1243 1225 1244 1226 return nvmeq; 1245 1227 ··· 1335 1317 * user requests may be waiting on a stopped queue. Start the 1336 1318 * queue to flush these to completion. 1337 1319 */ 1338 - blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); 1320 + blk_mq_unquiesce_queue(dev->ctrl.admin_q); 1339 1321 blk_cleanup_queue(dev->ctrl.admin_q); 1340 1322 blk_mq_free_tag_set(&dev->admin_tagset); 1341 1323 } ··· 1372 1354 return -ENODEV; 1373 1355 } 1374 1356 } else 1375 - blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); 1357 + blk_mq_unquiesce_queue(dev->ctrl.admin_q); 1376 1358 1377 1359 return 0; 1378 1360 } ··· 1403 1385 return 0; 1404 1386 } 1405 1387 1406 - static int nvme_configure_admin_queue(struct nvme_dev *dev) 1388 + static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) 1407 1389 { 1408 1390 int result; 1409 1391 u32 aqa; 1410 - u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); 1411 1392 struct nvme_queue *nvmeq; 1412 1393 1413 1394 result = nvme_remap_bar(dev, db_bar_size(dev, 0)); ··· 1414 1397 return result; 1415 1398 1416 1399 dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? 
1417 - NVME_CAP_NSSRC(cap) : 0; 1400 + NVME_CAP_NSSRC(dev->ctrl.cap) : 0; 1418 1401 1419 1402 if (dev->subsystem && 1420 1403 (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) 1421 1404 writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); 1422 1405 1423 - result = nvme_disable_ctrl(&dev->ctrl, cap); 1406 + result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); 1424 1407 if (result < 0) 1425 1408 return result; 1426 1409 ··· 1439 1422 lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); 1440 1423 lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); 1441 1424 1442 - result = nvme_enable_ctrl(&dev->ctrl, cap); 1425 + result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap); 1443 1426 if (result) 1444 1427 return result; 1445 1428 ··· 1458 1441 unsigned i, max; 1459 1442 int ret = 0; 1460 1443 1461 - for (i = dev->queue_count; i <= dev->max_qid; i++) { 1444 + for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { 1462 1445 /* vector == qid - 1, match nvme_create_queue */ 1463 1446 if (!nvme_alloc_queue(dev, i, dev->q_depth, 1464 1447 pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { ··· 1467 1450 } 1468 1451 } 1469 1452 1470 - max = min(dev->max_qid, dev->queue_count - 1); 1453 + max = min(dev->max_qid, dev->ctrl.queue_count - 1); 1471 1454 for (i = dev->online_queues; i <= max; i++) { 1472 1455 ret = nvme_create_queue(dev->queues[i], i); 1473 1456 if (ret) ··· 1602 1585 static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) 1603 1586 { 1604 1587 struct nvme_host_mem_buf_desc *descs; 1605 - u32 chunk_size, max_entries, i = 0; 1588 + u32 chunk_size, max_entries; 1589 + int i = 0; 1606 1590 void **bufs; 1607 - u64 size, tmp; 1591 + u64 size = 0, tmp; 1608 1592 1609 1593 /* start big and work our way down */ 1610 1594 chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER); ··· 1884 1866 1885 1867 static int nvme_pci_enable(struct nvme_dev *dev) 1886 1868 { 1887 - u64 cap; 1888 1869 int result = -ENOMEM; 1889 1870 struct pci_dev *pdev = 
to_pci_dev(dev->dev); 1890 1871 ··· 1910 1893 if (result < 0) 1911 1894 return result; 1912 1895 1913 - cap = lo_hi_readq(dev->bar + NVME_REG_CAP); 1896 + dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); 1914 1897 1915 - dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); 1916 - dev->db_stride = 1 << NVME_CAP_STRIDE(cap); 1898 + dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, 1899 + io_queue_depth); 1900 + dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); 1917 1901 dev->dbs = dev->bar + 4096; 1918 1902 1919 1903 /* ··· 1926 1908 dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " 1927 1909 "set queue depth=%u to work around controller resets\n", 1928 1910 dev->q_depth); 1911 + } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && 1912 + (pdev->device == 0xa821 || pdev->device == 0xa822) && 1913 + NVME_CAP_MQES(dev->ctrl.cap) == 0) { 1914 + dev->q_depth = 64; 1915 + dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, " 1916 + "set queue depth=%u\n", dev->q_depth); 1929 1917 } 1930 1918 1931 1919 /* ··· 2020 1996 nvme_stop_queues(&dev->ctrl); 2021 1997 2022 1998 queues = dev->online_queues - 1; 2023 - for (i = dev->queue_count - 1; i > 0; i--) 1999 + for (i = dev->ctrl.queue_count - 1; i > 0; i--) 2024 2000 nvme_suspend_queue(dev->queues[i]); 2025 2001 2026 2002 if (dead) { ··· 2028 2004 * probe, before the admin queue is configured. Thus, 2029 2005 * queue_count can be 0 here. 2030 2006 */ 2031 - if (dev->queue_count) 2007 + if (dev->ctrl.queue_count) 2032 2008 nvme_suspend_queue(dev->queues[0]); 2033 2009 } else { 2034 2010 nvme_disable_io_queues(dev, queues); ··· 2118 2094 if (result) 2119 2095 goto out; 2120 2096 2121 - result = nvme_configure_admin_queue(dev); 2097 + result = nvme_pci_configure_admin_queue(dev); 2122 2098 if (result) 2123 2099 goto out; 2124 2100 ··· 2157 2133 goto out; 2158 2134 2159 2135 /* 2160 - * A controller that can not execute IO typically requires user 2161 - * intervention to correct. 
For such degraded controllers, the driver 2162 - * should not submit commands the user did not request, so skip 2163 - * registering for asynchronous event notification on this condition. 2164 - */ 2165 - if (dev->online_queues > 1) 2166 - nvme_queue_async_events(&dev->ctrl); 2167 - 2168 - /* 2169 2136 * Keep the controller around but remove all namespaces if we don't have 2170 2137 * any working I/O queue. 2171 2138 */ ··· 2176 2161 goto out; 2177 2162 } 2178 2163 2179 - if (dev->online_queues > 1) 2180 - nvme_queue_scan(&dev->ctrl); 2164 + nvme_start_ctrl(&dev->ctrl); 2181 2165 return; 2182 2166 2183 2167 out: ··· 2355 2341 } 2356 2342 2357 2343 flush_work(&dev->ctrl.reset_work); 2358 - nvme_uninit_ctrl(&dev->ctrl); 2344 + nvme_stop_ctrl(&dev->ctrl); 2345 + nvme_remove_namespaces(&dev->ctrl); 2359 2346 nvme_dev_disable(dev, true); 2360 2347 nvme_free_host_mem(dev); 2361 2348 nvme_dev_remove_admin(dev); 2362 2349 nvme_free_queues(dev, 0); 2350 + nvme_uninit_ctrl(&dev->ctrl); 2363 2351 nvme_release_prp_pools(dev); 2364 2352 nvme_dev_unmap(dev); 2365 2353 nvme_put_ctrl(&dev->ctrl); ··· 2473 2457 { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ 2474 2458 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, 2475 2459 { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ 2460 + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, 2461 + { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ 2462 + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, 2463 + { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ 2476 2464 .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, 2477 2465 { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, 2478 2466 { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },

+49 -59

drivers/nvme/host/rdma.c

··· 86 86 87 87 struct nvme_rdma_queue { 88 88 struct nvme_rdma_qe *rsp_ring; 89 - u8 sig_count; 89 + atomic_t sig_count; 90 90 int queue_size; 91 91 size_t cmnd_capsule_len; 92 92 struct nvme_rdma_ctrl *ctrl; ··· 103 103 struct nvme_rdma_ctrl { 104 104 /* read only in the hot path */ 105 105 struct nvme_rdma_queue *queues; 106 - u32 queue_count; 107 106 108 107 /* other member variables */ 109 108 struct blk_mq_tag_set tag_set; ··· 118 119 struct blk_mq_tag_set admin_tag_set; 119 120 struct nvme_rdma_device *device; 120 121 121 - u64 cap; 122 122 u32 max_fr_pages; 123 123 124 124 struct sockaddr_storage addr; ··· 272 274 struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 273 275 int ret = 0; 274 276 275 - if (!req->mr->need_inval) 276 - goto out; 277 - 278 277 ib_dereg_mr(req->mr); 279 278 280 279 req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, ··· 344 349 struct nvme_rdma_ctrl *ctrl = data; 345 350 struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; 346 351 347 - BUG_ON(hctx_idx >= ctrl->queue_count); 352 + BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); 348 353 349 354 hctx->driver_data = queue; 350 355 return 0; ··· 520 525 queue->cmnd_capsule_len = sizeof(struct nvme_command); 521 526 522 527 queue->queue_size = queue_size; 528 + atomic_set(&queue->sig_count, 0); 523 529 524 530 queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, 525 531 RDMA_PS_TCP, IB_QPT_RC); ··· 583 587 { 584 588 int i; 585 589 586 - for (i = 1; i < ctrl->queue_count; i++) 590 + for (i = 1; i < ctrl->ctrl.queue_count; i++) 587 591 nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); 588 592 } 589 593 ··· 591 595 { 592 596 int i, ret = 0; 593 597 594 - for (i = 1; i < ctrl->queue_count; i++) { 598 + for (i = 1; i < ctrl->ctrl.queue_count; i++) { 595 599 ret = nvmf_connect_io_queue(&ctrl->ctrl, i); 596 600 if (ret) { 597 601 dev_info(ctrl->ctrl.device, ··· 619 623 if (ret) 620 624 return ret; 621 625 622 - ctrl->queue_count = nr_io_queues + 1; 623 - if 
(ctrl->queue_count < 2) 626 + ctrl->ctrl.queue_count = nr_io_queues + 1; 627 + if (ctrl->ctrl.queue_count < 2) 624 628 return 0; 625 629 626 630 dev_info(ctrl->ctrl.device, 627 631 "creating %d I/O queues.\n", nr_io_queues); 628 632 629 - for (i = 1; i < ctrl->queue_count; i++) { 633 + for (i = 1; i < ctrl->ctrl.queue_count; i++) { 630 634 ret = nvme_rdma_init_queue(ctrl, i, 631 635 ctrl->ctrl.opts->queue_size); 632 636 if (ret) { ··· 701 705 702 706 ++ctrl->ctrl.nr_reconnects; 703 707 704 - if (ctrl->queue_count > 1) { 708 + if (ctrl->ctrl.queue_count > 1) { 705 709 nvme_rdma_free_io_queues(ctrl); 706 710 707 711 ret = blk_mq_reinit_tagset(&ctrl->tag_set); ··· 725 729 726 730 set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); 727 731 728 - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); 732 + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); 729 733 if (ret) 730 734 goto requeue; 731 735 732 - nvme_start_keep_alive(&ctrl->ctrl); 733 - 734 - if (ctrl->queue_count > 1) { 736 + if (ctrl->ctrl.queue_count > 1) { 735 737 ret = nvme_rdma_init_io_queues(ctrl); 736 738 if (ret) 737 739 goto requeue; ··· 737 743 ret = nvme_rdma_connect_io_queues(ctrl); 738 744 if (ret) 739 745 goto requeue; 746 + 747 + blk_mq_update_nr_hw_queues(&ctrl->tag_set, 748 + ctrl->ctrl.queue_count - 1); 740 749 } 741 750 742 751 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 743 752 WARN_ON_ONCE(!changed); 744 753 ctrl->ctrl.nr_reconnects = 0; 745 754 746 - if (ctrl->queue_count > 1) { 747 - nvme_queue_scan(&ctrl->ctrl); 748 - nvme_queue_async_events(&ctrl->ctrl); 749 - } 755 + nvme_start_ctrl(&ctrl->ctrl); 750 756 751 757 dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); 752 758 ··· 764 770 struct nvme_rdma_ctrl, err_work); 765 771 int i; 766 772 767 - nvme_stop_keep_alive(&ctrl->ctrl); 773 + nvme_stop_ctrl(&ctrl->ctrl); 768 774 769 - for (i = 0; i < ctrl->queue_count; i++) 775 + for (i = 0; i < ctrl->ctrl.queue_count; i++) 770 776 clear_bit(NVME_RDMA_Q_LIVE, 
&ctrl->queues[i].flags); 771 777 772 - if (ctrl->queue_count > 1) 778 + if (ctrl->ctrl.queue_count > 1) 773 779 nvme_stop_queues(&ctrl->ctrl); 774 - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); 780 + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 775 781 776 782 /* We must take care of fastfail/requeue all our inflight requests */ 777 - if (ctrl->queue_count > 1) 783 + if (ctrl->ctrl.queue_count > 1) 778 784 blk_mq_tagset_busy_iter(&ctrl->tag_set, 779 785 nvme_cancel_request, &ctrl->ctrl); 780 786 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, ··· 784 790 * queues are not a live anymore, so restart the queues to fail fast 785 791 * new IO 786 792 */ 787 - blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true); 793 + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 788 794 nvme_start_queues(&ctrl->ctrl); 789 795 790 796 nvme_rdma_reconnect_or_remove(ctrl); ··· 1002 1008 nvme_rdma_wr_error(cq, wc, "SEND"); 1003 1009 } 1004 1010 1005 - static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) 1011 + /* 1012 + * We want to signal completion at least every queue depth/2. This returns the 1013 + * largest power of two that is not above half of (queue size + 1) to optimize 1014 + * (avoid divisions). 1015 + */ 1016 + static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) 1006 1017 { 1007 - int sig_limit; 1018 + int limit = 1 << ilog2((queue->queue_size + 1) / 2); 1008 1019 1009 - /* 1010 - * We signal completion every queue depth/2 and also handle the 1011 - * degenerated case of a device with queue_depth=1, where we 1012 - * would need to signal every message. 
1013 - */ 1014 - sig_limit = max(queue->queue_size / 2, 1); 1015 - return (++queue->sig_count % sig_limit) == 0; 1020 + return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0; 1016 1021 } 1017 1022 1018 1023 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, ··· 1567 1574 1568 1575 set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); 1569 1576 1570 - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); 1577 + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, 1578 + &ctrl->ctrl.cap); 1571 1579 if (error) { 1572 1580 dev_err(ctrl->ctrl.device, 1573 1581 "prop_get NVME_REG_CAP failed\n"); ··· 1576 1582 } 1577 1583 1578 1584 ctrl->ctrl.sqsize = 1579 - min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize); 1585 + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); 1580 1586 1581 - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); 1587 + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); 1582 1588 if (error) 1583 1589 goto out_cleanup_queue; 1584 1590 ··· 1594 1600 DMA_TO_DEVICE); 1595 1601 if (error) 1596 1602 goto out_cleanup_queue; 1597 - 1598 - nvme_start_keep_alive(&ctrl->ctrl); 1599 1603 1600 1604 return 0; 1601 1605 ··· 1612 1620 1613 1621 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) 1614 1622 { 1615 - nvme_stop_keep_alive(&ctrl->ctrl); 1616 1623 cancel_work_sync(&ctrl->err_work); 1617 1624 cancel_delayed_work_sync(&ctrl->reconnect_work); 1618 1625 1619 - if (ctrl->queue_count > 1) { 1626 + if (ctrl->ctrl.queue_count > 1) { 1620 1627 nvme_stop_queues(&ctrl->ctrl); 1621 1628 blk_mq_tagset_busy_iter(&ctrl->tag_set, 1622 1629 nvme_cancel_request, &ctrl->ctrl); ··· 1625 1634 if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags)) 1626 1635 nvme_shutdown_ctrl(&ctrl->ctrl); 1627 1636 1628 - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); 1637 + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 1629 1638 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, 1630 1639 nvme_cancel_request, &ctrl->ctrl); 1640 + 
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 1631 1641 nvme_rdma_destroy_admin_queue(ctrl); 1632 1642 } 1633 1643 1634 1644 static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) 1635 1645 { 1636 - nvme_uninit_ctrl(&ctrl->ctrl); 1646 + nvme_stop_ctrl(&ctrl->ctrl); 1647 + nvme_remove_namespaces(&ctrl->ctrl); 1637 1648 if (shutdown) 1638 1649 nvme_rdma_shutdown_ctrl(ctrl); 1639 1650 1651 + nvme_uninit_ctrl(&ctrl->ctrl); 1640 1652 if (ctrl->ctrl.tagset) { 1641 1653 blk_cleanup_queue(ctrl->ctrl.connect_q); 1642 1654 blk_mq_free_tag_set(&ctrl->tag_set); ··· 1701 1707 int ret; 1702 1708 bool changed; 1703 1709 1710 + nvme_stop_ctrl(&ctrl->ctrl); 1704 1711 nvme_rdma_shutdown_ctrl(ctrl); 1705 1712 1706 1713 ret = nvme_rdma_configure_admin_queue(ctrl); ··· 1711 1716 goto del_dead_ctrl; 1712 1717 } 1713 1718 1714 - if (ctrl->queue_count > 1) { 1719 + if (ctrl->ctrl.queue_count > 1) { 1715 1720 ret = blk_mq_reinit_tagset(&ctrl->tag_set); 1716 1721 if (ret) 1717 1722 goto del_dead_ctrl; ··· 1723 1728 ret = nvme_rdma_connect_io_queues(ctrl); 1724 1729 if (ret) 1725 1730 goto del_dead_ctrl; 1731 + 1732 + blk_mq_update_nr_hw_queues(&ctrl->tag_set, 1733 + ctrl->ctrl.queue_count - 1); 1726 1734 } 1727 1735 1728 1736 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 1729 1737 WARN_ON_ONCE(!changed); 1730 1738 1731 - if (ctrl->queue_count > 1) { 1732 - nvme_start_queues(&ctrl->ctrl); 1733 - nvme_queue_scan(&ctrl->ctrl); 1734 - nvme_queue_async_events(&ctrl->ctrl); 1735 - } 1739 + nvme_start_ctrl(&ctrl->ctrl); 1736 1740 1737 1741 return; 1738 1742 ··· 1779 1785 ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) + 1780 1786 SG_CHUNK_SIZE * sizeof(struct scatterlist); 1781 1787 ctrl->tag_set.driver_data = ctrl; 1782 - ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; 1788 + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; 1783 1789 ctrl->tag_set.timeout = NVME_IO_TIMEOUT; 1784 1790 1785 1791 ret = 
blk_mq_alloc_tag_set(&ctrl->tag_set); ··· 1857 1863 INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); 1858 1864 INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); 1859 1865 1860 - ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ 1866 + ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ 1861 1867 ctrl->ctrl.sqsize = opts->queue_size - 1; 1862 1868 ctrl->ctrl.kato = opts->kato; 1863 1869 1864 1870 ret = -ENOMEM; 1865 - ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues), 1871 + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), 1866 1872 GFP_KERNEL); 1867 1873 if (!ctrl->queues) 1868 1874 goto out_uninit_ctrl; ··· 1919 1925 list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); 1920 1926 mutex_unlock(&nvme_rdma_ctrl_mutex); 1921 1927 1922 - if (opts->nr_io_queues) { 1923 - nvme_queue_scan(&ctrl->ctrl); 1924 - nvme_queue_async_events(&ctrl->ctrl); 1925 - } 1928 + nvme_start_ctrl(&ctrl->ctrl); 1926 1929 1927 1930 return &ctrl->ctrl; 1928 1931 1929 1932 out_remove_admin_queue: 1930 - nvme_stop_keep_alive(&ctrl->ctrl); 1931 1933 nvme_rdma_destroy_admin_queue(ctrl); 1932 1934 out_kfree_queues: 1933 1935 kfree(ctrl->queues);

+13 -7

drivers/nvme/target/fc.c

··· 1164 1164 1165 1165 memset(acc, 0, sizeof(*acc)); 1166 1166 1167 - if (iod->rqstdatalen < sizeof(struct fcnvme_ls_cr_assoc_rqst)) 1167 + /* 1168 + * FC-NVME spec changes. There are initiators sending different 1169 + * lengths as padding sizes for Create Association Cmd descriptor 1170 + * was incorrect. 1171 + * Accept anything of "minimum" length. Assume format per 1.15 1172 + * spec (with HOSTID reduced to 16 bytes), ignore how long the 1173 + * trailing pad length is. 1174 + */ 1175 + if (iod->rqstdatalen < FCNVME_LSDESC_CRA_RQST_MINLEN) 1168 1176 ret = VERR_CR_ASSOC_LEN; 1169 - else if (rqst->desc_list_len != 1170 - fcnvme_lsdesc_len( 1171 - sizeof(struct fcnvme_ls_cr_assoc_rqst))) 1177 + else if (rqst->desc_list_len < 1178 + cpu_to_be32(FCNVME_LSDESC_CRA_RQST_MIN_LISTLEN)) 1172 1179 ret = VERR_CR_ASSOC_RQST_LEN; 1173 1180 else if (rqst->assoc_cmd.desc_tag != 1174 1181 cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD)) 1175 1182 ret = VERR_CR_ASSOC_CMD; 1176 - else if (rqst->assoc_cmd.desc_len != 1177 - fcnvme_lsdesc_len( 1178 - sizeof(struct fcnvme_lsdesc_cr_assoc_cmd))) 1183 + else if (rqst->assoc_cmd.desc_len < 1184 + cpu_to_be32(FCNVME_LSDESC_CRA_CMD_DESC_MIN_DESCLEN)) 1179 1185 ret = VERR_CR_ASSOC_CMD_LEN; 1180 1186 else if (!rqst->assoc_cmd.ersp_ratio || 1181 1187 (be16_to_cpu(rqst->assoc_cmd.ersp_ratio) >=

+1 -1

drivers/nvme/target/io-cmd.c

··· 85 85 bio_set_op_attrs(bio, op, op_flags); 86 86 87 87 bio_chain(bio, prev); 88 - cookie = submit_bio(prev); 88 + submit_bio(prev); 89 89 } 90 90 91 91 sector += sg->length >> 9;

+21 -26

drivers/nvme/target/loop.c

··· 44 44 45 45 struct nvme_loop_ctrl { 46 46 struct nvme_loop_queue *queues; 47 - u32 queue_count; 48 47 49 48 struct blk_mq_tag_set admin_tag_set; 50 49 51 50 struct list_head list; 52 - u64 cap; 53 51 struct blk_mq_tag_set tag_set; 54 52 struct nvme_loop_iod async_event_iod; 55 53 struct nvme_ctrl ctrl; ··· 239 241 struct nvme_loop_ctrl *ctrl = data; 240 242 struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; 241 243 242 - BUG_ON(hctx_idx >= ctrl->queue_count); 244 + BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); 243 245 244 246 hctx->driver_data = queue; 245 247 return 0; ··· 305 307 { 306 308 int i; 307 309 308 - for (i = 1; i < ctrl->queue_count; i++) 310 + for (i = 1; i < ctrl->ctrl.queue_count; i++) 309 311 nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); 310 312 } 311 313 ··· 328 330 if (ret) 329 331 goto out_destroy_queues; 330 332 331 - ctrl->queue_count++; 333 + ctrl->ctrl.queue_count++; 332 334 } 333 335 334 336 return 0; ··· 342 344 { 343 345 int i, ret; 344 346 345 - for (i = 1; i < ctrl->queue_count; i++) { 347 + for (i = 1; i < ctrl->ctrl.queue_count; i++) { 346 348 ret = nvmf_connect_io_queue(&ctrl->ctrl, i); 347 349 if (ret) 348 350 return ret; ··· 370 372 error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); 371 373 if (error) 372 374 return error; 373 - ctrl->queue_count = 1; 375 + ctrl->ctrl.queue_count = 1; 374 376 375 377 error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); 376 378 if (error) ··· 386 388 if (error) 387 389 goto out_cleanup_queue; 388 390 389 - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); 391 + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); 390 392 if (error) { 391 393 dev_err(ctrl->ctrl.device, 392 394 "prop_get NVME_REG_CAP failed\n"); ··· 394 396 } 395 397 396 398 ctrl->ctrl.sqsize = 397 - min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize); 399 + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); 398 400 399 - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); 401 + error 
= nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); 400 402 if (error) 401 403 goto out_cleanup_queue; 402 404 ··· 406 408 error = nvme_init_identify(&ctrl->ctrl); 407 409 if (error) 408 410 goto out_cleanup_queue; 409 - 410 - nvme_start_keep_alive(&ctrl->ctrl); 411 411 412 412 return 0; 413 413 ··· 420 424 421 425 static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) 422 426 { 423 - nvme_stop_keep_alive(&ctrl->ctrl); 424 - 425 - if (ctrl->queue_count > 1) { 427 + if (ctrl->ctrl.queue_count > 1) { 426 428 nvme_stop_queues(&ctrl->ctrl); 427 429 blk_mq_tagset_busy_iter(&ctrl->tag_set, 428 430 nvme_cancel_request, &ctrl->ctrl); ··· 430 436 if (ctrl->ctrl.state == NVME_CTRL_LIVE) 431 437 nvme_shutdown_ctrl(&ctrl->ctrl); 432 438 433 - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); 439 + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 434 440 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, 435 441 nvme_cancel_request, &ctrl->ctrl); 442 + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 436 443 nvme_loop_destroy_admin_queue(ctrl); 437 444 } 438 445 ··· 442 447 struct nvme_loop_ctrl *ctrl = container_of(work, 443 448 struct nvme_loop_ctrl, delete_work); 444 449 445 - nvme_uninit_ctrl(&ctrl->ctrl); 450 + nvme_stop_ctrl(&ctrl->ctrl); 451 + nvme_remove_namespaces(&ctrl->ctrl); 446 452 nvme_loop_shutdown_ctrl(ctrl); 453 + nvme_uninit_ctrl(&ctrl->ctrl); 447 454 nvme_put_ctrl(&ctrl->ctrl); 448 455 } 449 456 ··· 493 496 bool changed; 494 497 int ret; 495 498 499 + nvme_stop_ctrl(&ctrl->ctrl); 496 500 nvme_loop_shutdown_ctrl(ctrl); 497 501 498 502 ret = nvme_loop_configure_admin_queue(ctrl); ··· 508 510 if (ret) 509 511 goto out_destroy_io; 510 512 513 + blk_mq_update_nr_hw_queues(&ctrl->tag_set, 514 + ctrl->ctrl.queue_count - 1); 515 + 511 516 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); 512 517 WARN_ON_ONCE(!changed); 513 518 514 - nvme_queue_scan(&ctrl->ctrl); 515 - nvme_queue_async_events(&ctrl->ctrl); 516 - 517 - nvme_start_queues(&ctrl->ctrl); 519 + 
nvme_start_ctrl(&ctrl->ctrl); 518 520 519 521 return; 520 522 ··· 557 559 ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + 558 560 SG_CHUNK_SIZE * sizeof(struct scatterlist); 559 561 ctrl->tag_set.driver_data = ctrl; 560 - ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; 562 + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; 561 563 ctrl->tag_set.timeout = NVME_IO_TIMEOUT; 562 564 ctrl->ctrl.tagset = &ctrl->tag_set; 563 565 ··· 649 651 list_add_tail(&ctrl->list, &nvme_loop_ctrl_list); 650 652 mutex_unlock(&nvme_loop_ctrl_mutex); 651 653 652 - if (opts->nr_io_queues) { 653 - nvme_queue_scan(&ctrl->ctrl); 654 - nvme_queue_async_events(&ctrl->ctrl); 655 - } 654 + nvme_start_ctrl(&ctrl->ctrl); 656 655 657 656 return &ctrl->ctrl; 658 657

+3 -2

drivers/scsi/lpfc/lpfc_scsi.c

··· 26 26 #include <linux/export.h> 27 27 #include <linux/delay.h> 28 28 #include <asm/unaligned.h> 29 + #include <linux/t10-pi.h> 29 30 #include <linux/crc-t10dif.h> 30 31 #include <net/checksum.h> 31 32 ··· 2935 2934 * First check to see if a protection data 2936 2935 * check is valid 2937 2936 */ 2938 - if ((src->ref_tag == 0xffffffff) || 2939 - (src->app_tag == 0xffff)) { 2937 + if ((src->ref_tag == T10_PI_REF_ESCAPE) || 2938 + (src->app_tag == T10_PI_APP_ESCAPE)) { 2940 2939 start_ref_tag++; 2941 2940 goto skipit; 2942 2941 }

+4 -4

drivers/scsi/qla2xxx/qla_isr.c

··· 2040 2040 * For type 3: ref & app tag is all 'f's 2041 2041 * For type 0,1,2: app tag is all 'f's 2042 2042 */ 2043 - if ((a_app_tag == 0xffff) && 2043 + if ((a_app_tag == T10_PI_APP_ESCAPE) && 2044 2044 ((scsi_get_prot_type(cmd) != SCSI_PROT_DIF_TYPE3) || 2045 - (a_ref_tag == 0xffffffff))) { 2045 + (a_ref_tag == T10_PI_REF_ESCAPE))) { 2046 2046 uint32_t blocks_done, resid; 2047 2047 sector_t lba_s = scsi_get_lba(cmd); 2048 2048 ··· 2084 2084 spt = page_address(sg_page(sg)) + sg->offset; 2085 2085 spt += j; 2086 2086 2087 - spt->app_tag = 0xffff; 2087 + spt->app_tag = T10_PI_APP_ESCAPE; 2088 2088 if (scsi_get_prot_type(cmd) == SCSI_PROT_DIF_TYPE3) 2089 - spt->ref_tag = 0xffffffff; 2089 + spt->ref_tag = T10_PI_REF_ESCAPE; 2090 2090 } 2091 2091 2092 2092 return 0;

+1 -1

drivers/target/target_core_sbc.c

··· 1450 1450 (unsigned long long)sector, sdt->guard_tag, 1451 1451 sdt->app_tag, be32_to_cpu(sdt->ref_tag)); 1452 1452 1453 - if (sdt->app_tag == cpu_to_be16(0xffff)) { 1453 + if (sdt->app_tag == T10_PI_APP_ESCAPE) { 1454 1454 dsg_off += block_size; 1455 1455 goto next; 1456 1456 }

+24 -23

include/linux/bio.h

··· 165 165 { 166 166 iter->bi_sector += bytes >> 9; 167 167 168 - if (bio_no_advance_iter(bio)) 168 + if (bio_no_advance_iter(bio)) { 169 169 iter->bi_size -= bytes; 170 - else 170 + iter->bi_done += bytes; 171 + } else { 171 172 bvec_iter_advance(bio->bi_io_vec, iter, bytes); 173 + /* TODO: It is reasonable to complete bio with error here. */ 174 + } 175 + } 176 + 177 + static inline bool bio_rewind_iter(struct bio *bio, struct bvec_iter *iter, 178 + unsigned int bytes) 179 + { 180 + iter->bi_sector -= bytes >> 9; 181 + 182 + if (bio_no_advance_iter(bio)) { 183 + iter->bi_size += bytes; 184 + iter->bi_done -= bytes; 185 + return true; 186 + } 187 + 188 + return bvec_iter_rewind(bio->bi_io_vec, iter, bytes); 172 189 } 173 190 174 191 #define __bio_for_each_segment(bvl, bio, iter, start) \ ··· 319 302 struct bio *bip_bio; /* parent bio */ 320 303 321 304 struct bvec_iter bip_iter; 322 - 323 - bio_end_io_t *bip_end_io; /* saved I/O completion fn */ 324 305 325 306 unsigned short bip_slab; /* slab the bip came from */ 326 307 unsigned short bip_vcnt; /* # of integrity bio_vecs */ ··· 737 722 bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) 738 723 739 724 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); 740 - extern void bio_integrity_free(struct bio *); 741 725 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); 742 - extern bool bio_integrity_enabled(struct bio *bio); 743 - extern int bio_integrity_prep(struct bio *); 744 - extern void bio_integrity_endio(struct bio *); 726 + extern bool bio_integrity_prep(struct bio *); 745 727 extern void bio_integrity_advance(struct bio *, unsigned int); 746 - extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); 728 + extern void bio_integrity_trim(struct bio *); 747 729 extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t); 748 730 extern int bioset_integrity_create(struct bio_set *, int); 749 731 extern void 
bioset_integrity_free(struct bio_set *); ··· 753 741 return NULL; 754 742 } 755 743 756 - static inline bool bio_integrity_enabled(struct bio *bio) 757 - { 758 - return false; 759 - } 760 - 761 744 static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) 762 745 { 763 746 return 0; ··· 763 756 return; 764 757 } 765 758 766 - static inline int bio_integrity_prep(struct bio *bio) 759 + static inline bool bio_integrity_prep(struct bio *bio) 767 760 { 768 - return 0; 769 - } 770 - 771 - static inline void bio_integrity_free(struct bio *bio) 772 - { 773 - return; 761 + return true; 774 762 } 775 763 776 764 static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, ··· 780 778 return; 781 779 } 782 780 783 - static inline void bio_integrity_trim(struct bio *bio, unsigned int offset, 784 - unsigned int sectors) 781 + static inline void bio_integrity_trim(struct bio *bio) 785 782 { 786 783 return; 787 784 }

+36 -5

include/linux/bvec.h

··· 22 22 23 23 #include <linux/kernel.h> 24 24 #include <linux/bug.h> 25 + #include <linux/errno.h> 25 26 26 27 /* 27 28 * was unsigned short, but we might as well be ready for > 64kB I/O pages ··· 39 38 unsigned int bi_size; /* residual I/O count */ 40 39 41 40 unsigned int bi_idx; /* current index into bvl_vec */ 41 + 42 + unsigned int bi_done; /* number of bytes completed */ 42 43 43 44 unsigned int bi_bvec_done; /* number of bytes completed in 44 45 current bvec */ ··· 69 66 .bv_offset = bvec_iter_offset((bvec), (iter)), \ 70 67 }) 71 68 72 - static inline void bvec_iter_advance(const struct bio_vec *bv, 73 - struct bvec_iter *iter, 74 - unsigned bytes) 69 + static inline bool bvec_iter_advance(const struct bio_vec *bv, 70 + struct bvec_iter *iter, unsigned bytes) 75 71 { 76 - WARN_ONCE(bytes > iter->bi_size, 77 - "Attempted to advance past end of bvec iter\n"); 72 + if (WARN_ONCE(bytes > iter->bi_size, 73 + "Attempted to advance past end of bvec iter\n")) { 74 + iter->bi_size = 0; 75 + return false; 76 + } 78 77 79 78 while (bytes) { 80 79 unsigned iter_len = bvec_iter_len(bv, *iter); ··· 85 80 bytes -= len; 86 81 iter->bi_size -= len; 87 82 iter->bi_bvec_done += len; 83 + iter->bi_done += len; 88 84 89 85 if (iter->bi_bvec_done == __bvec_iter_bvec(bv, *iter)->bv_len) { 90 86 iter->bi_bvec_done = 0; 91 87 iter->bi_idx++; 92 88 } 93 89 } 90 + return true; 91 + } 92 + 93 + static inline bool bvec_iter_rewind(const struct bio_vec *bv, 94 + struct bvec_iter *iter, 95 + unsigned int bytes) 96 + { 97 + while (bytes) { 98 + unsigned len = min(bytes, iter->bi_bvec_done); 99 + 100 + if (iter->bi_bvec_done == 0) { 101 + if (WARN_ONCE(iter->bi_idx == 0, 102 + "Attempted to rewind iter beyond " 103 + "bvec's boundaries\n")) { 104 + return false; 105 + } 106 + iter->bi_idx--; 107 + iter->bi_bvec_done = __bvec_iter_bvec(bv, *iter)->bv_len; 108 + continue; 109 + } 110 + bytes -= len; 111 + iter->bi_size += len; 112 + iter->bi_bvec_done -= len; 113 + } 114 + return true; 94 
115 } 95 116 96 117 #define for_each_bvec(bvl, bio_vec, iter, start) \

+22 -1

include/linux/nvme-fc.h

··· 17 17 18 18 /* 19 19 * This file contains definitions relative to FC-NVME r1.14 (16-020vB). 20 + * The fcnvme_lsdesc_cr_assoc_cmd struct reflects expected r1.16 content. 20 21 */ 21 22 22 23 #ifndef _NVME_FC_H ··· 194 193 uuid_t hostid; 195 194 u8 hostnqn[FCNVME_ASSOC_HOSTNQN_LEN]; 196 195 u8 subnqn[FCNVME_ASSOC_SUBNQN_LEN]; 197 - u8 rsvd632[384]; 196 + __be32 rsvd584[108]; /* pad to 1016 bytes, 197 + * which makes overall LS rqst 198 + * payload 1024 bytes 199 + */ 198 200 }; 201 + 202 + #define FCNVME_LSDESC_CRA_CMD_DESC_MINLEN \ 203 + offsetof(struct fcnvme_lsdesc_cr_assoc_cmd, rsvd584) 204 + 205 + #define FCNVME_LSDESC_CRA_CMD_DESC_MIN_DESCLEN \ 206 + (FCNVME_LSDESC_CRA_CMD_DESC_MINLEN - \ 207 + offsetof(struct fcnvme_lsdesc_cr_assoc_cmd, ersp_ratio)) 208 + 209 + 199 210 200 211 /* FCNVME_LSDESC_CREATE_CONN_CMD */ 201 212 struct fcnvme_lsdesc_cr_conn_cmd { ··· 285 272 __be32 desc_list_len; 286 273 struct fcnvme_lsdesc_cr_assoc_cmd assoc_cmd; 287 274 }; 275 + 276 + #define FCNVME_LSDESC_CRA_RQST_MINLEN \ 277 + (offsetof(struct fcnvme_ls_cr_assoc_rqst, assoc_cmd) + \ 278 + FCNVME_LSDESC_CRA_CMD_DESC_MINLEN) 279 + 280 + #define FCNVME_LSDESC_CRA_RQST_MIN_LISTLEN \ 281 + FCNVME_LSDESC_CRA_CMD_DESC_MINLEN 282 + 288 283 289 284 struct fcnvme_ls_cr_assoc_acc { 290 285 struct fcnvme_ls_acc_hdr hdr;

+2

include/linux/t10-pi.h

··· 33 33 __be32 ref_tag; /* Target LBA or indirect LBA */ 34 34 }; 35 35 36 + #define T10_PI_APP_ESCAPE cpu_to_be16(0xffff) 37 + #define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff) 36 38 37 39 extern const struct blk_integrity_profile t10_pi_type1_crc; 38 40 extern const struct blk_integrity_profile t10_pi_type1_ip;

Configure Feed

Configure Feed