Merge tag 'block-6.17-20250808' of git://git.kernel.dk/linux

+21 -45

block/bfq-iosched.c

··· 454 454 */ 455 455 static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) 456 456 { 457 - struct bfq_io_cq *icq; 458 - unsigned long flags; 459 - 460 457 if (!current->io_context) 461 458 return NULL; 462 459 463 - spin_lock_irqsave(&q->queue_lock, flags); 464 - icq = icq_to_bic(ioc_lookup_icq(q)); 465 - spin_unlock_irqrestore(&q->queue_lock, flags); 466 - 467 - return icq; 460 + return icq_to_bic(ioc_lookup_icq(q)); 468 461 } 469 462 470 463 /* ··· 694 701 { 695 702 struct bfq_data *bfqd = data->q->elevator->elevator_data; 696 703 struct bfq_io_cq *bic = bfq_bic_lookup(data->q); 697 - int depth; 698 - unsigned limit = data->q->nr_requests; 699 - unsigned int act_idx; 704 + unsigned int limit, act_idx; 700 705 701 706 /* Sync reads have full depth available */ 702 - if (op_is_sync(opf) && !op_is_write(opf)) { 703 - depth = 0; 704 - } else { 705 - depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; 706 - limit = (limit * depth) >> bfqd->full_depth_shift; 707 - } 707 + if (op_is_sync(opf) && !op_is_write(opf)) 708 + limit = data->q->nr_requests; 709 + else 710 + limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; 708 711 709 712 for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { 710 713 /* Fast path to check if bfqq is already allocated. */ ··· 714 725 * available requests and thus starve other entities. 715 726 */ 716 727 if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) { 717 - depth = 1; 728 + limit = 1; 718 729 break; 719 730 } 720 731 } 732 + 721 733 bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", 722 - __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); 723 - if (depth) 724 - data->shallow_depth = depth; 734 + __func__, bfqd->wr_busy_queues, op_is_sync(opf), limit); 735 + 736 + if (limit < data->q->nr_requests) 737 + data->shallow_depth = limit; 725 738 } 726 739 727 740 static struct bfq_queue * ··· 2448 2457 unsigned int nr_segs) 2449 2458 { 2450 2459 struct bfq_data *bfqd = q->elevator->elevator_data; 2451 - struct request *free = NULL; 2452 - /* 2453 - * bfq_bic_lookup grabs the queue_lock: invoke it now and 2454 - * store its return value for later use, to avoid nesting 2455 - * queue_lock inside the bfqd->lock. We assume that the bic 2456 - * returned by bfq_bic_lookup does not go away before 2457 - * bfqd->lock is taken. 2458 - */ 2459 2460 struct bfq_io_cq *bic = bfq_bic_lookup(q); 2461 + struct request *free = NULL; 2460 2462 bool ret; 2461 2463 2462 2464 spin_lock_irq(&bfqd->lock); ··· 7112 7128 */ 7113 7129 static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) 7114 7130 { 7115 - unsigned int depth = 1U << bt->sb.shift; 7131 + unsigned int nr_requests = bfqd->queue->nr_requests; 7116 7132 7117 - bfqd->full_depth_shift = bt->sb.shift; 7118 7133 /* 7119 7134 * In-word depths if no bfq_queue is being weight-raised: 7120 7135 * leaving 25% of tags only for sync reads. ··· 7125 7142 * limit 'something'. 7126 7143 */ 7127 7144 /* no more than 50% of tags for async I/O */ 7128 - bfqd->word_depths[0][0] = max(depth >> 1, 1U); 7145 + bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U); 7129 7146 /* 7130 7147 * no more than 75% of tags for sync writes (25% extra tags 7131 7148 * w.r.t. async I/O, to prevent async I/O from starving sync 7132 7149 * writes) 7133 7150 */ 7134 - bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U); 7151 + bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U); 7135 7152 7136 7153 /* 7137 7154 * In-word depths in case some bfq_queue is being weight- ··· 7141 7158 * shortage. 7142 7159 */ 7143 7160 /* no more than ~18% of tags for async I/O */ 7144 - bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); 7161 + bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U); 7145 7162 /* no more than ~37% of tags for sync writes (~20% extra tags) */ 7146 - bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); 7163 + bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U); 7147 7164 } 7148 7165 7149 7166 static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) ··· 7215 7232 root_group->sched_data.bfq_class_idle_last_service = jiffies; 7216 7233 } 7217 7234 7218 - static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) 7235 + static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) 7219 7236 { 7220 7237 struct bfq_data *bfqd; 7221 - struct elevator_queue *eq; 7222 7238 unsigned int i; 7223 7239 struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; 7224 7240 7225 - eq = elevator_alloc(q, e); 7226 - if (!eq) 7241 + bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); 7242 + if (!bfqd) 7227 7243 return -ENOMEM; 7228 7244 7229 - bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); 7230 - if (!bfqd) { 7231 - kobject_put(&eq->kobj); 7232 - return -ENOMEM; 7233 - } 7234 7245 eq->elevator_data = bfqd; 7235 7246 7236 7247 spin_lock_irq(&q->queue_lock); ··· 7382 7405 7383 7406 out_free: 7384 7407 kfree(bfqd); 7385 - kobject_put(&eq->kobj); 7386 7408 return -ENOMEM; 7387 7409 } 7388 7410

+6 -7

block/bfq-iosched.h

··· 427 427 */ 428 428 bool saved_IO_bound; 429 429 430 - u64 saved_io_start_time; 431 - u64 saved_tot_idle_time; 432 - 433 430 /* 434 431 * Same purpose as the previous fields for the values of the 435 432 * field keeping the queue's belonging to a large burst ··· 447 450 */ 448 451 unsigned int saved_weight; 449 452 453 + u64 saved_io_start_time; 454 + u64 saved_tot_idle_time; 455 + 450 456 /* 451 457 * Similar to previous fields: save wr information. 452 458 */ ··· 457 457 unsigned long saved_last_wr_start_finish; 458 458 unsigned long saved_service_from_wr; 459 459 unsigned long saved_wr_start_at_switch_to_srt; 460 - unsigned int saved_wr_cur_max_time; 461 460 struct bfq_ttime saved_ttime; 461 + unsigned int saved_wr_cur_max_time; 462 462 463 463 /* Save also injection state */ 464 - u64 saved_last_serv_time_ns; 465 464 unsigned int saved_inject_limit; 466 465 unsigned long saved_decrease_time_jif; 466 + u64 saved_last_serv_time_ns; 467 467 468 468 /* candidate queue for a stable merge (due to close creation time) */ 469 469 struct bfq_queue *stable_merge_bfqq; ··· 813 813 * Depth limits used in bfq_limit_depth (see comments on the 814 814 * function) 815 815 */ 816 - unsigned int word_depths[2][2]; 817 - unsigned int full_depth_shift; 816 + unsigned int async_depths[2][2]; 818 817 819 818 /* 820 819 * Number of independent actuators. This is equal to 1 in

+6 -10

block/blk-ioc.c

··· 308 308 309 309 #ifdef CONFIG_BLK_ICQ 310 310 /** 311 - * ioc_lookup_icq - lookup io_cq from ioc 311 + * ioc_lookup_icq - lookup io_cq from ioc in io issue path 312 312 * @q: the associated request_queue 313 313 * 314 314 * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called 315 - * with @q->queue_lock held. 315 + * from io issue path, either return NULL if current issue io to @q for the 316 + * first time, or return a valid icq. 316 317 */ 317 318 struct io_cq *ioc_lookup_icq(struct request_queue *q) 318 319 { 319 320 struct io_context *ioc = current->io_context; 320 321 struct io_cq *icq; 321 322 322 - lockdep_assert_held(&q->queue_lock); 323 - 324 323 /* 325 324 * icq's are indexed from @ioc using radix tree and hint pointer, 326 - * both of which are protected with RCU. All removals are done 327 - * holding both q and ioc locks, and we're holding q lock - if we 328 - * find a icq which points to us, it's guaranteed to be valid. 325 + * both of which are protected with RCU, io issue path ensures that 326 + * both request_queue and current task are valid, the found icq 327 + * is guaranteed to be valid until the io is done. 329 328 */ 330 329 rcu_read_lock(); 331 330 icq = rcu_dereference(ioc->icq_hint); ··· 418 419 task_unlock(current); 419 420 } else { 420 421 get_io_context(ioc); 421 - 422 - spin_lock_irq(&q->queue_lock); 423 422 icq = ioc_lookup_icq(q); 424 - spin_unlock_irq(&q->queue_lock); 425 423 } 426 424 427 425 if (!icq) {

+152 -71

block/blk-mq-sched.c

··· 374 374 } 375 375 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); 376 376 377 - static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, 378 - struct blk_mq_hw_ctx *hctx, 379 - unsigned int hctx_idx) 380 - { 381 - if (blk_mq_is_shared_tags(q->tag_set->flags)) { 382 - hctx->sched_tags = q->sched_shared_tags; 383 - return 0; 384 - } 385 - 386 - hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx, 387 - q->nr_requests); 388 - 389 - if (!hctx->sched_tags) 390 - return -ENOMEM; 391 - return 0; 392 - } 393 - 394 - static void blk_mq_exit_sched_shared_tags(struct request_queue *queue) 395 - { 396 - blk_mq_free_rq_map(queue->sched_shared_tags); 397 - queue->sched_shared_tags = NULL; 398 - } 399 - 400 377 /* called in queue's release handler, tagset has gone away */ 401 378 static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) 402 379 { 403 380 struct blk_mq_hw_ctx *hctx; 404 381 unsigned long i; 405 382 406 - queue_for_each_hw_ctx(q, hctx, i) { 407 - if (hctx->sched_tags) { 408 - if (!blk_mq_is_shared_tags(flags)) 409 - blk_mq_free_rq_map(hctx->sched_tags); 410 - hctx->sched_tags = NULL; 411 - } 412 - } 383 + queue_for_each_hw_ctx(q, hctx, i) 384 + hctx->sched_tags = NULL; 413 385 414 386 if (blk_mq_is_shared_tags(flags)) 415 - blk_mq_exit_sched_shared_tags(q); 416 - } 417 - 418 - static int blk_mq_init_sched_shared_tags(struct request_queue *queue) 419 - { 420 - struct blk_mq_tag_set *set = queue->tag_set; 421 - 422 - /* 423 - * Set initial depth at max so that we don't need to reallocate for 424 - * updating nr_requests. 425 - */ 426 - queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set, 427 - BLK_MQ_NO_HCTX_IDX, 428 - MAX_SCHED_RQ); 429 - if (!queue->sched_shared_tags) 430 - return -ENOMEM; 431 - 432 - blk_mq_tag_update_sched_shared_tags(queue); 433 - 434 - return 0; 387 + q->sched_shared_tags = NULL; 435 388 } 436 389 437 390 void blk_mq_sched_reg_debugfs(struct request_queue *q) ··· 411 458 mutex_unlock(&q->debugfs_mutex); 412 459 } 413 460 461 + void blk_mq_free_sched_tags(struct elevator_tags *et, 462 + struct blk_mq_tag_set *set) 463 + { 464 + unsigned long i; 465 + 466 + /* Shared tags are stored at index 0 in @tags. */ 467 + if (blk_mq_is_shared_tags(set->flags)) 468 + blk_mq_free_map_and_rqs(set, et->tags[0], BLK_MQ_NO_HCTX_IDX); 469 + else { 470 + for (i = 0; i < et->nr_hw_queues; i++) 471 + blk_mq_free_map_and_rqs(set, et->tags[i], i); 472 + } 473 + 474 + kfree(et); 475 + } 476 + 477 + void blk_mq_free_sched_tags_batch(struct xarray *et_table, 478 + struct blk_mq_tag_set *set) 479 + { 480 + struct request_queue *q; 481 + struct elevator_tags *et; 482 + 483 + lockdep_assert_held_write(&set->update_nr_hwq_lock); 484 + 485 + list_for_each_entry(q, &set->tag_list, tag_set_list) { 486 + /* 487 + * Accessing q->elevator without holding q->elevator_lock is 488 + * safe because we're holding here set->update_nr_hwq_lock in 489 + * the writer context. So, scheduler update/switch code (which 490 + * acquires the same lock but in the reader context) can't run 491 + * concurrently. 492 + */ 493 + if (q->elevator) { 494 + et = xa_load(et_table, q->id); 495 + if (unlikely(!et)) 496 + WARN_ON_ONCE(1); 497 + else 498 + blk_mq_free_sched_tags(et, set); 499 + } 500 + } 501 + } 502 + 503 + struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, 504 + unsigned int nr_hw_queues) 505 + { 506 + unsigned int nr_tags; 507 + int i; 508 + struct elevator_tags *et; 509 + gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; 510 + 511 + if (blk_mq_is_shared_tags(set->flags)) 512 + nr_tags = 1; 513 + else 514 + nr_tags = nr_hw_queues; 515 + 516 + et = kmalloc(sizeof(struct elevator_tags) + 517 + nr_tags * sizeof(struct blk_mq_tags *), gfp); 518 + if (!et) 519 + return NULL; 520 + /* 521 + * Default to double of smaller one between hw queue_depth and 522 + * 128, since we don't split into sync/async like the old code 523 + * did. Additionally, this is a per-hw queue depth. 524 + */ 525 + et->nr_requests = 2 * min_t(unsigned int, set->queue_depth, 526 + BLKDEV_DEFAULT_RQ); 527 + et->nr_hw_queues = nr_hw_queues; 528 + 529 + if (blk_mq_is_shared_tags(set->flags)) { 530 + /* Shared tags are stored at index 0 in @tags. */ 531 + et->tags[0] = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, 532 + MAX_SCHED_RQ); 533 + if (!et->tags[0]) 534 + goto out; 535 + } else { 536 + for (i = 0; i < et->nr_hw_queues; i++) { 537 + et->tags[i] = blk_mq_alloc_map_and_rqs(set, i, 538 + et->nr_requests); 539 + if (!et->tags[i]) 540 + goto out_unwind; 541 + } 542 + } 543 + 544 + return et; 545 + out_unwind: 546 + while (--i >= 0) 547 + blk_mq_free_map_and_rqs(set, et->tags[i], i); 548 + out: 549 + kfree(et); 550 + return NULL; 551 + } 552 + 553 + int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, 554 + struct blk_mq_tag_set *set, unsigned int nr_hw_queues) 555 + { 556 + struct request_queue *q; 557 + struct elevator_tags *et; 558 + gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; 559 + 560 + lockdep_assert_held_write(&set->update_nr_hwq_lock); 561 + 562 + list_for_each_entry(q, &set->tag_list, tag_set_list) { 563 + /* 564 + * Accessing q->elevator without holding q->elevator_lock is 565 + * safe because we're holding here set->update_nr_hwq_lock in 566 + * the writer context. So, scheduler update/switch code (which 567 + * acquires the same lock but in the reader context) can't run 568 + * concurrently. 569 + */ 570 + if (q->elevator) { 571 + et = blk_mq_alloc_sched_tags(set, nr_hw_queues); 572 + if (!et) 573 + goto out_unwind; 574 + if (xa_insert(et_table, q->id, et, gfp)) 575 + goto out_free_tags; 576 + } 577 + } 578 + return 0; 579 + out_free_tags: 580 + blk_mq_free_sched_tags(et, set); 581 + out_unwind: 582 + list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) { 583 + if (q->elevator) { 584 + et = xa_load(et_table, q->id); 585 + if (et) 586 + blk_mq_free_sched_tags(et, set); 587 + } 588 + } 589 + return -ENOMEM; 590 + } 591 + 414 592 /* caller must have a reference to @e, will grab another one if successful */ 415 - int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) 593 + int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, 594 + struct elevator_tags *et) 416 595 { 417 596 unsigned int flags = q->tag_set->flags; 418 597 struct blk_mq_hw_ctx *hctx; ··· 552 467 unsigned long i; 553 468 int ret; 554 469 555 - /* 556 - * Default to double of smaller one between hw queue_depth and 128, 557 - * since we don't split into sync/async like the old code did. 558 - * Additionally, this is a per-hw queue depth. 559 - */ 560 - q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, 561 - BLKDEV_DEFAULT_RQ); 470 + eq = elevator_alloc(q, e, et); 471 + if (!eq) 472 + return -ENOMEM; 473 + 474 + q->nr_requests = et->nr_requests; 562 475 563 476 if (blk_mq_is_shared_tags(flags)) { 564 - ret = blk_mq_init_sched_shared_tags(q); 565 - if (ret) 566 - return ret; 477 + /* Shared tags are stored at index 0 in @et->tags. */ 478 + q->sched_shared_tags = et->tags[0]; 479 + blk_mq_tag_update_sched_shared_tags(q); 567 480 } 568 481 569 482 queue_for_each_hw_ctx(q, hctx, i) { 570 - ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); 571 - if (ret) 572 - goto err_free_map_and_rqs; 483 + if (blk_mq_is_shared_tags(flags)) 484 + hctx->sched_tags = q->sched_shared_tags; 485 + else 486 + hctx->sched_tags = et->tags[i]; 573 487 } 574 488 575 - ret = e->ops.init_sched(q, e); 489 + ret = e->ops.init_sched(q, eq); 576 490 if (ret) 577 - goto err_free_map_and_rqs; 491 + goto out; 578 492 579 493 queue_for_each_hw_ctx(q, hctx, i) { 580 494 if (e->ops.init_hctx) { 581 495 ret = e->ops.init_hctx(hctx, i); 582 496 if (ret) { 583 - eq = q->elevator; 584 - blk_mq_sched_free_rqs(q); 585 497 blk_mq_exit_sched(q, eq); 586 498 kobject_put(&eq->kobj); 587 499 return ret; ··· 587 505 } 588 506 return 0; 589 507 590 - err_free_map_and_rqs: 591 - blk_mq_sched_free_rqs(q); 508 + out: 592 509 blk_mq_sched_tags_teardown(q, flags); 593 - 510 + kobject_put(&eq->kobj); 594 511 q->elevator = NULL; 595 512 return ret; 596 513 }

+11 -1

block/blk-mq-sched.h

··· 18 18 19 19 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); 20 20 21 - int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); 21 + int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, 22 + struct elevator_tags *et); 22 23 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); 23 24 void blk_mq_sched_free_rqs(struct request_queue *q); 25 + 26 + struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, 27 + unsigned int nr_hw_queues); 28 + int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, 29 + struct blk_mq_tag_set *set, unsigned int nr_hw_queues); 30 + void blk_mq_free_sched_tags(struct elevator_tags *et, 31 + struct blk_mq_tag_set *set); 32 + void blk_mq_free_sched_tags_batch(struct xarray *et_table, 33 + struct blk_mq_tag_set *set); 24 34 25 35 static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) 26 36 {

+11 -5

block/blk-mq.c

··· 4974 4974 * Switch back to the elevator type stored in the xarray. 4975 4975 */ 4976 4976 static void blk_mq_elv_switch_back(struct request_queue *q, 4977 - struct xarray *elv_tbl) 4977 + struct xarray *elv_tbl, struct xarray *et_tbl) 4978 4978 { 4979 4979 struct elevator_type *e = xa_load(elv_tbl, q->id); 4980 + struct elevator_tags *t = xa_load(et_tbl, q->id); 4980 4981 4981 4982 /* The elv_update_nr_hw_queues unfreezes the queue. */ 4982 - elv_update_nr_hw_queues(q, e); 4983 + elv_update_nr_hw_queues(q, e, t); 4983 4984 4984 4985 /* Drop the reference acquired in blk_mq_elv_switch_none. */ 4985 4986 if (e) ··· 5032 5031 int prev_nr_hw_queues = set->nr_hw_queues; 5033 5032 unsigned int memflags; 5034 5033 int i; 5035 - struct xarray elv_tbl; 5034 + struct xarray elv_tbl, et_tbl; 5036 5035 5037 5036 lockdep_assert_held(&set->tag_list_lock); 5038 5037 ··· 5044 5043 return; 5045 5044 5046 5045 memflags = memalloc_noio_save(); 5046 + 5047 + xa_init(&et_tbl); 5048 + if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) 5049 + goto out_memalloc_restore; 5047 5050 5048 5051 xa_init(&elv_tbl); 5049 5052 ··· 5092 5087 switch_back: 5093 5088 /* The blk_mq_elv_switch_back unfreezes queue for us. */ 5094 5089 list_for_each_entry(q, &set->tag_list, tag_set_list) 5095 - blk_mq_elv_switch_back(q, &elv_tbl); 5090 + blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); 5096 5091 5097 5092 list_for_each_entry(q, &set->tag_list, tag_set_list) { 5098 5093 blk_mq_sysfs_register_hctxs(q); ··· 5103 5098 } 5104 5099 5105 5100 xa_destroy(&elv_tbl); 5106 - 5101 + xa_destroy(&et_tbl); 5102 + out_memalloc_restore: 5107 5103 memalloc_noio_restore(memflags); 5108 5104 5109 5105 /* Free the excess tags when nr_hw_queues shrink. */

+26 -7

block/blk-settings.c

··· 62 62 void blk_apply_bdi_limits(struct backing_dev_info *bdi, 63 63 struct queue_limits *lim) 64 64 { 65 + u64 io_opt = lim->io_opt; 66 + 65 67 /* 66 68 * For read-ahead of large files to be effective, we need to read ahead 67 - * at least twice the optimal I/O size. 69 + * at least twice the optimal I/O size. For rotational devices that do 70 + * not report an optimal I/O size (e.g. ATA HDDs), use the maximum I/O 71 + * size to avoid falling back to the (rather inefficient) small default 72 + * read-ahead size. 68 73 * 69 74 * There is no hardware limitation for the read-ahead size and the user 70 75 * might have increased the read-ahead size through sysfs, so don't ever 71 76 * decrease it. 72 77 */ 78 + if (!io_opt && (lim->features & BLK_FEAT_ROTATIONAL)) 79 + io_opt = (u64)lim->max_sectors << SECTOR_SHIFT; 80 + 73 81 bdi->ra_pages = max3(bdi->ra_pages, 74 - lim->io_opt * 2 / PAGE_SIZE, 82 + io_opt * 2 >> PAGE_SHIFT, 75 83 VM_READAHEAD_PAGES); 76 84 bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; 77 85 } ··· 320 312 pr_warn("Invalid logical block size (%d)\n", lim->logical_block_size); 321 313 return -EINVAL; 322 314 } 323 - if (lim->physical_block_size < lim->logical_block_size) 315 + if (lim->physical_block_size < lim->logical_block_size) { 324 316 lim->physical_block_size = lim->logical_block_size; 317 + } else if (!is_power_of_2(lim->physical_block_size)) { 318 + pr_warn("Invalid physical block size (%d)\n", lim->physical_block_size); 319 + return -EINVAL; 320 + } 325 321 326 322 /* 327 323 * The minimum I/O size defaults to the physical block size unless ··· 400 388 lim->max_discard_sectors = 401 389 min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); 402 390 391 + /* 392 + * When discard is not supported, discard_granularity should be reported 393 + * as 0 to userspace. 394 + */ 395 + if (lim->max_discard_sectors) 396 + lim->discard_granularity = 397 + max(lim->discard_granularity, lim->physical_block_size); 398 + else 399 + lim->discard_granularity = 0; 400 + 403 401 if (!lim->max_discard_segments) 404 402 lim->max_discard_segments = 1; 405 - 406 - if (lim->discard_granularity < lim->physical_block_size) 407 - lim->discard_granularity = lim->physical_block_size; 408 403 409 404 /* 410 405 * By default there is no limit on the segment boundary alignment, ··· 868 849 } 869 850 870 851 /* chunk_sectors a multiple of the physical block size? */ 871 - if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { 852 + if (t->chunk_sectors % (t->physical_block_size >> SECTOR_SHIFT)) { 872 853 t->chunk_sectors = 0; 873 854 t->flags |= BLK_FLAG_MISALIGNED; 874 855 ret = -1;

+3 -1

block/blk.h

··· 12 12 #include "blk-crypto-internal.h" 13 13 14 14 struct elevator_type; 15 + struct elevator_tags; 15 16 16 17 /* 17 18 * Default upper limit for the software max_sectors limit used for regular I/Os. ··· 331 330 332 331 bool blk_insert_flush(struct request *rq); 333 332 334 - void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e); 333 + void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, 334 + struct elevator_tags *t); 335 335 void elevator_set_default(struct request_queue *q); 336 336 void elevator_set_none(struct request_queue *q); 337 337

+31 -7

block/elevator.c

··· 54 54 struct elevator_queue *old; 55 55 /* for registering new elevator */ 56 56 struct elevator_queue *new; 57 + /* holds sched tags data */ 58 + struct elevator_tags *et; 57 59 }; 58 60 59 61 static DEFINE_SPINLOCK(elv_list_lock); ··· 134 132 static const struct kobj_type elv_ktype; 135 133 136 134 struct elevator_queue *elevator_alloc(struct request_queue *q, 137 - struct elevator_type *e) 135 + struct elevator_type *e, struct elevator_tags *et) 138 136 { 139 137 struct elevator_queue *eq; 140 138 ··· 147 145 kobject_init(&eq->kobj, &elv_ktype); 148 146 mutex_init(&eq->sysfs_lock); 149 147 hash_init(eq->hash); 148 + eq->et = et; 150 149 151 150 return eq; 152 151 } 153 - EXPORT_SYMBOL(elevator_alloc); 154 152 155 153 static void elevator_release(struct kobject *kobj) 156 154 { ··· 168 166 lockdep_assert_held(&q->elevator_lock); 169 167 170 168 ioc_clear_queue(q); 171 - blk_mq_sched_free_rqs(q); 172 169 173 170 mutex_lock(&e->sysfs_lock); 174 171 blk_mq_exit_sched(q, e); ··· 593 592 } 594 593 595 594 if (new_e) { 596 - ret = blk_mq_init_sched(q, new_e); 595 + ret = blk_mq_init_sched(q, new_e, ctx->et); 597 596 if (ret) 598 597 goto out_unfreeze; 599 598 ctx->new = q->elevator; ··· 628 627 elevator_exit(q); 629 628 mutex_unlock(&q->elevator_lock); 630 629 blk_mq_unfreeze_queue(q, memflags); 631 - if (e) 630 + if (e) { 631 + blk_mq_free_sched_tags(e->et, q->tag_set); 632 632 kobject_put(&e->kobj); 633 + } 633 634 } 634 635 635 636 static int elevator_change_done(struct request_queue *q, ··· 644 641 &ctx->old->flags); 645 642 646 643 elv_unregister_queue(q, ctx->old); 644 + blk_mq_free_sched_tags(ctx->old->et, q->tag_set); 647 645 kobject_put(&ctx->old->kobj); 648 646 if (enable_wbt) 649 647 wbt_enable_default(q->disk); ··· 663 659 static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) 664 660 { 665 661 unsigned int memflags; 662 + struct blk_mq_tag_set *set = q->tag_set; 666 663 int ret = 0; 667 664 668 - lockdep_assert_held(&q->tag_set->update_nr_hwq_lock); 665 + lockdep_assert_held(&set->update_nr_hwq_lock); 666 + 667 + if (strncmp(ctx->name, "none", 4)) { 668 + ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues); 669 + if (!ctx->et) 670 + return -ENOMEM; 671 + } 669 672 670 673 memflags = blk_mq_freeze_queue(q); 671 674 /* ··· 692 681 blk_mq_unfreeze_queue(q, memflags); 693 682 if (!ret) 694 683 ret = elevator_change_done(q, ctx); 684 + /* 685 + * Free sched tags if it's allocated but we couldn't switch elevator. 686 + */ 687 + if (ctx->et && !ctx->new) 688 + blk_mq_free_sched_tags(ctx->et, set); 695 689 696 690 return ret; 697 691 } ··· 705 689 * The I/O scheduler depends on the number of hardware queues, this forces a 706 690 * reattachment when nr_hw_queues changes. 707 691 */ 708 - void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e) 692 + void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, 693 + struct elevator_tags *t) 709 694 { 695 + struct blk_mq_tag_set *set = q->tag_set; 710 696 struct elv_change_ctx ctx = {}; 711 697 int ret = -ENODEV; 712 698 ··· 716 698 717 699 if (e && !blk_queue_dying(q) && blk_queue_registered(q)) { 718 700 ctx.name = e->elevator_name; 701 + ctx.et = t; 719 702 720 703 mutex_lock(&q->elevator_lock); 721 704 /* force to reattach elevator after nr_hw_queue is updated */ ··· 726 707 blk_mq_unfreeze_queue_nomemrestore(q); 727 708 if (!ret) 728 709 WARN_ON_ONCE(elevator_change_done(q, &ctx)); 710 + /* 711 + * Free sched tags if it's allocated but we couldn't switch elevator. 712 + */ 713 + if (t && !ctx.new) 714 + blk_mq_free_sched_tags(t, set); 729 715 } 730 716 731 717 /*

+13 -3

block/elevator.h

··· 23 23 struct blk_mq_alloc_data; 24 24 struct blk_mq_hw_ctx; 25 25 26 + struct elevator_tags { 27 + /* num. of hardware queues for which tags are allocated */ 28 + unsigned int nr_hw_queues; 29 + /* depth used while allocating tags */ 30 + unsigned int nr_requests; 31 + /* shared tag is stored at index 0 */ 32 + struct blk_mq_tags *tags[]; 33 + }; 34 + 26 35 struct elevator_mq_ops { 27 - int (*init_sched)(struct request_queue *, struct elevator_type *); 36 + int (*init_sched)(struct request_queue *, struct elevator_queue *); 28 37 void (*exit_sched)(struct elevator_queue *); 29 38 int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); 30 39 void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); ··· 122 113 struct elevator_queue 123 114 { 124 115 struct elevator_type *type; 116 + struct elevator_tags *et; 125 117 void *elevator_data; 126 118 struct kobject kobj; 127 119 struct mutex sysfs_lock; ··· 162 152 ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); 163 153 164 154 extern bool elv_bio_merge_ok(struct request *, struct bio *); 165 - extern struct elevator_queue *elevator_alloc(struct request_queue *, 166 - struct elevator_type *); 155 + struct elevator_queue *elevator_alloc(struct request_queue *, 156 + struct elevator_type *, struct elevator_tags *); 167 157 168 158 /* 169 159 * Helper functions.

+4 -16

block/kyber-iosched.c

··· 157 157 */ 158 158 struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; 159 159 160 - /* 161 - * Async request percentage, converted to per-word depth for 162 - * sbitmap_get_shallow(). 163 - */ 160 + /* Number of allowed async requests. */ 164 161 unsigned int async_depth; 165 162 166 163 struct kyber_cpu_latency __percpu *cpu_latency; ··· 399 402 return ERR_PTR(ret); 400 403 } 401 404 402 - static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) 405 + static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) 403 406 { 404 407 struct kyber_queue_data *kqd; 405 - struct elevator_queue *eq; 406 - 407 - eq = elevator_alloc(q, e); 408 - if (!eq) 409 - return -ENOMEM; 410 408 411 409 kqd = kyber_queue_data_alloc(q); 412 - if (IS_ERR(kqd)) { 413 - kobject_put(&eq->kobj); 410 + if (IS_ERR(kqd)) 414 411 return PTR_ERR(kqd); 415 - } 416 412 417 413 blk_stat_enable_accounting(q); 418 414 ··· 444 454 { 445 455 struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; 446 456 struct blk_mq_tags *tags = hctx->sched_tags; 447 - unsigned int shift = tags->bitmap_tags.sb.shift; 448 457 449 - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; 450 - 458 + kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U; 451 459 sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); 452 460 } 453 461

+3 -27

block/mq-deadline.c

··· 488 488 } 489 489 490 490 /* 491 - * 'depth' is a number in the range 1..INT_MAX representing a number of 492 - * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since 493 - * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow(). 494 - * Values larger than q->nr_requests have the same effect as q->nr_requests. 495 - */ 496 - static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) 497 - { 498 - struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; 499 - const unsigned int nrr = hctx->queue->nr_requests; 500 - 501 - return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; 502 - } 503 - 504 - /* 505 491 * Called by __blk_mq_alloc_request(). The shallow_depth value set by this 506 492 * function is used by __blk_mq_get_tag(). 507 493 */ ··· 503 517 * Throttle asynchronous requests and writes such that these requests 504 518 * do not block the allocation of synchronous requests. 505 519 */ 506 - data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth); 520 + data->shallow_depth = dd->async_depth; 507 521 } 508 522 509 523 /* Called by blk_mq_update_nr_requests(). */ ··· 554 568 /* 555 569 * initialize elevator private data (deadline_data). 556 570 */ 557 - static int dd_init_sched(struct request_queue *q, struct elevator_type *e) 571 + static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) 558 572 { 559 573 struct deadline_data *dd; 560 - struct elevator_queue *eq; 561 574 enum dd_prio prio; 562 - int ret = -ENOMEM; 563 - 564 - eq = elevator_alloc(q, e); 565 - if (!eq) 566 - return ret; 567 575 568 576 dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); 569 577 if (!dd) 570 - goto put_eq; 578 + return -ENOMEM; 571 579 572 580 eq->elevator_data = dd; 573 581 ··· 588 608 589 609 q->elevator = eq; 590 610 return 0; 591 - 592 - put_eq: 593 - kobject_put(&eq->kobj); 594 - return ret; 595 611 } 596 612 597 613 /*

+2 -1

drivers/block/zloop.c

··· 700 700 struct zloop_device *zlo = disk->private_data; 701 701 unsigned int i; 702 702 703 + blk_mq_free_tag_set(&zlo->tag_set); 704 + 703 705 for (i = 0; i < zlo->nr_zones; i++) { 704 706 struct zloop_zone *zone = &zlo->zones[i]; 705 707 ··· 1082 1080 1083 1081 del_gendisk(zlo->disk); 1084 1082 put_disk(zlo->disk); 1085 - blk_mq_free_tag_set(&zlo->tag_set); 1086 1083 1087 1084 pr_info("Removed device %d\n", opts->id); 1088 1085

+21 -21

drivers/md/dm-raid.c

··· 438 438 /* Return true, if raid set in @rs is recovering */ 439 439 static bool rs_is_recovering(struct raid_set *rs) 440 440 { 441 - return rs->md.recovery_cp < rs->md.dev_sectors; 441 + return rs->md.resync_offset < rs->md.dev_sectors; 442 442 } 443 443 444 444 /* Return true, if raid set in @rs is reshaping */ ··· 768 768 rs->md.layout = raid_type->algorithm; 769 769 rs->md.new_layout = rs->md.layout; 770 770 rs->md.delta_disks = 0; 771 - rs->md.recovery_cp = MaxSector; 771 + rs->md.resync_offset = MaxSector; 772 772 773 773 for (i = 0; i < raid_devs; i++) 774 774 md_rdev_init(&rs->dev[i].rdev); ··· 912 912 rs->md.external = 0; 913 913 rs->md.persistent = 1; 914 914 rs->md.major_version = 2; 915 - } else if (rebuild && !rs->md.recovery_cp) { 915 + } else if (rebuild && !rs->md.resync_offset) { 916 916 /* 917 917 * Without metadata, we will not be able to tell if the array 918 918 * is in-sync or not - we must assume it is not. Therefore, ··· 1695 1695 { 1696 1696 /* raid0 does not recover */ 1697 1697 if (rs_is_raid0(rs)) 1698 - rs->md.recovery_cp = MaxSector; 1698 + rs->md.resync_offset = MaxSector; 1699 1699 /* 1700 1700 * A raid6 set has to be recovered either 1701 1701 * completely or for the grown part to 1702 1702 * ensure proper parity and Q-Syndrome 1703 1703 */ 1704 1704 else if (rs_is_raid6(rs)) 1705 - rs->md.recovery_cp = dev_sectors; 1705 + rs->md.resync_offset = dev_sectors; 1706 1706 /* 1707 1707 * Other raid set types may skip recovery 1708 1708 * depending on the 'nosync' flag. 1709 1709 */ 1710 1710 else 1711 - rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) 1711 + rs->md.resync_offset = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) 1712 1712 ? MaxSector : dev_sectors; 1713 1713 } 1714 1714 ··· 2143 2143 sb->events = cpu_to_le64(mddev->events); 2144 2144 2145 2145 sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); 2146 - sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); 2146 + sb->array_resync_offset = cpu_to_le64(mddev->resync_offset); 2147 2147 2148 2148 sb->level = cpu_to_le32(mddev->level); 2149 2149 sb->layout = cpu_to_le32(mddev->layout); ··· 2334 2334 } 2335 2335 2336 2336 if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) 2337 - mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); 2337 + mddev->resync_offset = le64_to_cpu(sb->array_resync_offset); 2338 2338 2339 2339 /* 2340 2340 * During load, we set FirstUse if a new superblock was written. 2341 2341 * There are two reasons we might not have a superblock: 2342 2342 * 1) The raid set is brand new - in which case, all of the 2343 2343 * devices must have their In_sync bit set. Also, 2344 - * recovery_cp must be 0, unless forced. 2344 + * resync_offset must be 0, unless forced. 2345 2345 * 2) This is a new device being added to an old raid set 2346 2346 * and the new device needs to be rebuilt - in which 2347 2347 * case the In_sync bit will /not/ be set and 2348 - * recovery_cp must be MaxSector. 2348 + * resync_offset must be MaxSector. 2349 2349 * 3) This is/are a new device(s) being added to an old 2350 2350 * raid set during takeover to a higher raid level 2351 2351 * to provide capacity for redundancy or during reshape ··· 2390 2390 new_devs > 1 ? "s" : ""); 2391 2391 return -EINVAL; 2392 2392 } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) { 2393 - DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", 2394 - (unsigned long long) mddev->recovery_cp); 2393 + DMERR("'rebuild' specified while raid set is not in-sync (resync_offset=%llu)", 2394 + (unsigned long long) mddev->resync_offset); 2395 2395 return -EINVAL; 2396 2396 } else if (rs_is_reshaping(rs)) { 2397 2397 DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)", ··· 2700 2700 } 2701 2701 out: 2702 2702 /* 2703 - * Raise recovery_cp in case data_offset != 0 to 2703 + * Raise resync_offset in case data_offset != 0 to 2704 2704 * avoid false recovery positives in the constructor. 2705 2705 */ 2706 - if (rs->md.recovery_cp < rs->md.dev_sectors) 2707 - rs->md.recovery_cp += rs->dev[0].rdev.data_offset; 2706 + if (rs->md.resync_offset < rs->md.dev_sectors) 2707 + rs->md.resync_offset += rs->dev[0].rdev.data_offset; 2708 2708 2709 2709 /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ 2710 2710 rdev_for_each(rdev, &rs->md) { ··· 2759 2759 } 2760 2760 2761 2761 clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags); 2762 - mddev->recovery_cp = MaxSector; 2762 + mddev->resync_offset = MaxSector; 2763 2763 2764 2764 while (d--) { 2765 2765 rdev = &rs->dev[d].rdev; ··· 2767 2767 if (test_bit(d, (void *) rs->rebuild_disks)) { 2768 2768 clear_bit(In_sync, &rdev->flags); 2769 2769 clear_bit(Faulty, &rdev->flags); 2770 - mddev->recovery_cp = rdev->recovery_offset = 0; 2770 + mddev->resync_offset = rdev->recovery_offset = 0; 2771 2771 /* Bitmap has to be created when we do an "up" takeover */ 2772 2772 set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); 2773 2773 } ··· 3225 3225 if (r) 3226 3226 goto bad; 3227 3227 3228 - rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors); 3228 + rs_setup_recovery(rs, rs->md.resync_offset < rs->md.dev_sectors ? rs->md.resync_offset : rs->md.dev_sectors); 3229 3229 } else { 3230 3230 /* This is no size change or it is shrinking, update size and record in superblocks */ 3231 3231 r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); ··· 3449 3449 3450 3450 } else { 3451 3451 if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery)) 3452 - r = mddev->recovery_cp; 3452 + r = mddev->resync_offset; 3453 3453 else 3454 3454 r = mddev->curr_resync_completed; 3455 3455 ··· 4077 4077 } 4078 4078 4079 4079 /* Check for any resize/reshape on @rs and adjust/initiate */ 4080 - if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { 4080 + if (mddev->resync_offset && mddev->resync_offset < MaxSector) { 4081 4081 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4082 - mddev->resync_min = mddev->recovery_cp; 4082 + mddev->resync_min = mddev->resync_offset; 4083 4083 if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) 4084 4084 mddev->resync_max_sectors = mddev->dev_sectors; 4085 4085 }

+4 -4

drivers/md/md-bitmap.c

··· 1987 1987 1988 1988 md_bitmap_set_memory_bits(bitmap, sec, 1); 1989 1989 md_bitmap_file_set_bit(bitmap, sec); 1990 - if (sec < bitmap->mddev->recovery_cp) 1990 + if (sec < bitmap->mddev->resync_offset) 1991 1991 /* We are asserting that the array is dirty, 1992 - * so move the recovery_cp address back so 1992 + * so move the resync_offset address back so 1993 1993 * that it is obvious that it is dirty 1994 1994 */ 1995 - bitmap->mddev->recovery_cp = sec; 1995 + bitmap->mddev->resync_offset = sec; 1996 1996 } 1997 1997 } 1998 1998 ··· 2258 2258 || bitmap->events_cleared == mddev->events) 2259 2259 /* no need to keep dirty bits to optimise a 2260 2260 * re-add of a missing device */ 2261 - start = mddev->recovery_cp; 2261 + start = mddev->resync_offset; 2262 2262 2263 2263 mutex_lock(&mddev->bitmap_info.mutex); 2264 2264 err = md_bitmap_init_from_disk(bitmap, start);

+8 -8

drivers/md/md-cluster.c

··· 337 337 md_wakeup_thread(mddev->sync_thread); 338 338 339 339 if (hi > 0) { 340 - if (lo < mddev->recovery_cp) 341 - mddev->recovery_cp = lo; 340 + if (lo < mddev->resync_offset) 341 + mddev->resync_offset = lo; 342 342 /* wake up thread to continue resync in case resync 343 343 * is not finished */ 344 - if (mddev->recovery_cp != MaxSector) { 344 + if (mddev->resync_offset != MaxSector) { 345 345 /* 346 346 * clear the REMOTE flag since we will launch 347 347 * resync thread in current node. ··· 863 863 lockres_free(bm_lockres); 864 864 continue; 865 865 } 866 - if ((hi > 0) && (lo < mddev->recovery_cp)) { 866 + if ((hi > 0) && (lo < mddev->resync_offset)) { 867 867 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 868 - mddev->recovery_cp = lo; 868 + mddev->resync_offset = lo; 869 869 md_check_recovery(mddev); 870 870 } 871 871 ··· 1027 1027 * Also, we should send BITMAP_NEEDS_SYNC message in 1028 1028 * case reshaping is interrupted. 1029 1029 */ 1030 - if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) || 1030 + if ((cinfo->slot_number > 0 && mddev->resync_offset != MaxSector) || 1031 1031 (mddev->reshape_position != MaxSector && 1032 1032 test_bit(MD_CLOSING, &mddev->flags))) 1033 1033 resync_bitmap(mddev); ··· 1605 1605 pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); 1606 1606 goto out; 1607 1607 } 1608 - if ((hi > 0) && (lo < mddev->recovery_cp)) 1609 - mddev->recovery_cp = lo; 1608 + if ((hi > 0) && (lo < mddev->resync_offset)) 1609 + mddev->resync_offset = lo; 1610 1610 } 1611 1611 out: 1612 1612 return err;

+44 -29

drivers/md/md.c

··· 637 637 return; 638 638 639 639 /* 640 + * If array is freed by stopping array, MD_DELETED is set by 641 + * do_md_stop(), MD_DELETED is still set here in case mddev is freed 642 + * directly by closing a mddev that is created by create_on_open. 643 + */ 644 + set_bit(MD_DELETED, &mddev->flags); 645 + /* 640 646 * Call queue_work inside the spinlock so that flush_workqueue() after 641 647 * mddev_find will succeed in waiting for the work to be done. 642 648 */ ··· 1415 1409 mddev->layout = -1; 1416 1410 1417 1411 if (sb->state & (1<<MD_SB_CLEAN)) 1418 - mddev->recovery_cp = MaxSector; 1412 + mddev->resync_offset = MaxSector; 1419 1413 else { 1420 1414 if (sb->events_hi == sb->cp_events_hi && 1421 1415 sb->events_lo == sb->cp_events_lo) { 1422 - mddev->recovery_cp = sb->recovery_cp; 1416 + mddev->resync_offset = sb->resync_offset; 1423 1417 } else 1424 - mddev->recovery_cp = 0; 1418 + mddev->resync_offset = 0; 1425 1419 } 1426 1420 1427 1421 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); ··· 1547 1541 mddev->minor_version = sb->minor_version; 1548 1542 if (mddev->in_sync) 1549 1543 { 1550 - sb->recovery_cp = mddev->recovery_cp; 1544 + sb->resync_offset = mddev->resync_offset; 1551 1545 sb->cp_events_hi = (mddev->events>>32); 1552 1546 sb->cp_events_lo = (u32)mddev->events; 1553 - if (mddev->recovery_cp == MaxSector) 1547 + if (mddev->resync_offset == MaxSector) 1554 1548 sb->state = (1<< MD_SB_CLEAN); 1555 1549 } else 1556 - sb->recovery_cp = 0; 1550 + sb->resync_offset = 0; 1557 1551 1558 1552 sb->layout = mddev->layout; 1559 1553 sb->chunk_size = mddev->chunk_sectors << 9; ··· 1901 1895 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1902 1896 mddev->reshape_backwards = 0; 1903 1897 1904 - mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1898 + mddev->resync_offset = le64_to_cpu(sb->resync_offset); 1905 1899 memcpy(mddev->uuid, sb->set_uuid, 16); 1906 1900 1907 1901 mddev->max_disks = (4096-256)/2; ··· 2087 2081 sb->utime = cpu_to_le64((__u64)mddev->utime); 2088 2082 sb->events = cpu_to_le64(mddev->events); 2089 2083 if (mddev->in_sync) 2090 - sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 2084 + sb->resync_offset = cpu_to_le64(mddev->resync_offset); 2091 2085 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2092 2086 sb->resync_offset = cpu_to_le64(MaxSector); 2093 2087 else ··· 2767 2761 /* If this is just a dirty<->clean transition, and the array is clean 2768 2762 * and 'events' is odd, we can roll back to the previous clean state */ 2769 2763 if (nospares 2770 - && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2764 + && (mddev->in_sync && mddev->resync_offset == MaxSector) 2771 2765 && mddev->can_decrease_events 2772 2766 && mddev->events != 1) { 2773 2767 mddev->events--; ··· 4303 4297 static ssize_t 4304 4298 resync_start_show(struct mddev *mddev, char *page) 4305 4299 { 4306 - if (mddev->recovery_cp == MaxSector) 4300 + if (mddev->resync_offset == MaxSector) 4307 4301 return sprintf(page, "none\n"); 4308 - return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 4302 + return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); 4309 4303 } 4310 4304 4311 4305 static ssize_t ··· 4331 4325 err = -EBUSY; 4332 4326 4333 4327 if (!err) { 4334 - mddev->recovery_cp = n; 4328 + mddev->resync_offset = n; 4335 4329 if (mddev->pers) 4336 4330 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4337 4331 } ··· 6423 6417 mddev->external_size = 0; 6424 6418 mddev->dev_sectors = 0; 6425 6419 mddev->raid_disks = 0; 6426 - mddev->recovery_cp = 0; 6420 + mddev->resync_offset = 0; 6427 6421 mddev->resync_min = 0; 6428 6422 mddev->resync_max = MaxSector; 6429 6423 mddev->reshape_position = MaxSector; ··· 7368 7362 * openned 7369 7363 */ 7370 7364 if (info->state & (1<<MD_SB_CLEAN)) 7371 - mddev->recovery_cp = MaxSector; 7365 + mddev->resync_offset = MaxSector; 7372 7366 else 7373 - mddev->recovery_cp = 0; 7367 + mddev->resync_offset = 0; 7374 7368 mddev->persistent = ! info->not_persistent; 7375 7369 mddev->external = 0; 7376 7370 ··· 8309 8303 seq_printf(seq, "\tresync=REMOTE"); 8310 8304 return 1; 8311 8305 } 8312 - if (mddev->recovery_cp < MaxSector) { 8306 + if (mddev->resync_offset < MaxSector) { 8313 8307 seq_printf(seq, "\tresync=PENDING"); 8314 8308 return 1; 8315 8309 } ··· 8952 8946 return mddev->resync_min; 8953 8947 case ACTION_RESYNC: 8954 8948 if (!mddev->bitmap) 8955 - return mddev->recovery_cp; 8949 + return mddev->resync_offset; 8956 8950 return 0; 8957 8951 case ACTION_RESHAPE: 8958 8952 /* ··· 9190 9184 atomic_read(&mddev->recovery_active) == 0); 9191 9185 mddev->curr_resync_completed = j; 9192 9186 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9193 - j > mddev->recovery_cp) 9194 - mddev->recovery_cp = j; 9187 + j > mddev->resync_offset) 9188 + mddev->resync_offset = j; 9195 9189 update_time = jiffies; 9196 9190 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9197 9191 sysfs_notify_dirent_safe(mddev->sysfs_completed); ··· 9311 9305 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9312 9306 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9313 9307 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9314 - if (mddev->curr_resync >= mddev->recovery_cp) { 9308 + if (mddev->curr_resync >= mddev->resync_offset) { 9315 9309 pr_debug("md: checkpointing %s of %s.\n", 9316 9310 desc, mdname(mddev)); 9317 9311 if (test_bit(MD_RECOVERY_ERROR, 9318 9312 &mddev->recovery)) 9319 - mddev->recovery_cp = 9313 + mddev->resync_offset = 9320 9314 mddev->curr_resync_completed; 9321 9315 else 9322 - mddev->recovery_cp = 9316 + mddev->resync_offset = 9323 9317 mddev->curr_resync; 9324 9318 } 9325 9319 } else 9326 - mddev->recovery_cp = MaxSector; 9320 + mddev->resync_offset = MaxSector; 9327 9321 } else { 9328 9322 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9329 9323 mddev->curr_resync = MaxSector; ··· 9427 9421 9428 9422 static bool rdev_addable(struct md_rdev *rdev) 9429 9423 { 9424 + struct mddev *mddev; 9425 + 9426 + mddev = READ_ONCE(rdev->mddev); 9427 + if (!mddev) 9428 + return false; 9429 + 9430 9430 /* rdev is already used, don't add it again. */ 9431 9431 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || 9432 9432 test_bit(Faulty, &rdev->flags)) ··· 9443 9431 return true; 9444 9432 9445 9433 /* Allow to add if array is read-write. */ 9446 - if (md_is_rdwr(rdev->mddev)) 9434 + if (md_is_rdwr(mddev)) 9447 9435 return true; 9448 9436 9449 9437 /* ··· 9545 9533 } 9546 9534 9547 9535 /* Check if resync is in progress. */ 9548 - if (mddev->recovery_cp < MaxSector) { 9536 + if (mddev->resync_offset < MaxSector) { 9549 9537 remove_spares(mddev, NULL); 9550 9538 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9551 9539 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); ··· 9726 9714 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 9727 9715 (mddev->external == 0 && mddev->safemode == 1) || 9728 9716 (mddev->safemode == 2 9729 - && !mddev->in_sync && mddev->recovery_cp == MaxSector) 9717 + && !mddev->in_sync && mddev->resync_offset == MaxSector) 9730 9718 )) 9731 9719 return; 9732 9720 ··· 9783 9771 * remove disk. 9784 9772 */ 9785 9773 rdev_for_each_safe(rdev, tmp, mddev) { 9786 - if (test_and_clear_bit(ClusterRemove, &rdev->flags) && 9787 - rdev->raid_disk < 0) 9774 + if (rdev->raid_disk < 0 && 9775 + test_and_clear_bit(ClusterRemove, &rdev->flags)) 9788 9776 md_kick_rdev_from_array(rdev); 9789 9777 } 9790 9778 } ··· 10090 10078 10091 10079 /* Check for change of roles in the active devices */ 10092 10080 rdev_for_each_safe(rdev2, tmp, mddev) { 10093 - if (test_bit(Faulty, &rdev2->flags)) 10081 + if (test_bit(Faulty, &rdev2->flags)) { 10082 + if (test_bit(ClusterRemove, &rdev2->flags)) 10083 + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 10094 10084 continue; 10085 + } 10095 10086 10096 10087 /* Check if the roles changed */ 10097 10088 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

+1 -1

drivers/md/md.h

··· 523 523 unsigned long normal_io_events; /* IO event timestamp */ 524 524 atomic_t recovery_active; /* blocks scheduled, but not written */ 525 525 wait_queue_head_t recovery_wait; 526 - sector_t recovery_cp; 526 + sector_t resync_offset; 527 527 sector_t resync_min; /* user requested sync 528 528 * starts here */ 529 529 sector_t resync_max; /* resync should pause

+3 -3

drivers/md/raid0.c

··· 674 674 mddev->raid_disks--; 675 675 mddev->delta_disks = -1; 676 676 /* make sure it will be not marked as dirty */ 677 - mddev->recovery_cp = MaxSector; 677 + mddev->resync_offset = MaxSector; 678 678 mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); 679 679 680 680 create_strip_zones(mddev, &priv_conf); ··· 717 717 mddev->raid_disks += mddev->delta_disks; 718 718 mddev->degraded = 0; 719 719 /* make sure it will be not marked as dirty */ 720 - mddev->recovery_cp = MaxSector; 720 + mddev->resync_offset = MaxSector; 721 721 mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); 722 722 723 723 create_strip_zones(mddev, &priv_conf); ··· 760 760 mddev->delta_disks = 1 - mddev->raid_disks; 761 761 mddev->raid_disks = 1; 762 762 /* make sure it will be not marked as dirty */ 763 - mddev->recovery_cp = MaxSector; 763 + mddev->resync_offset = MaxSector; 764 764 mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); 765 765 766 766 create_strip_zones(mddev, &priv_conf);

+1 -1

drivers/md/raid1-10.c

··· 283 283 static inline bool raid1_should_read_first(struct mddev *mddev, 284 284 sector_t this_sector, int len) 285 285 { 286 - if ((mddev->recovery_cp < this_sector + len)) 286 + if ((mddev->resync_offset < this_sector + len)) 287 287 return true; 288 288 289 289 if (mddev_is_clustered(mddev) &&

+35 -59

drivers/md/raid1.c

··· 127 127 return get_resync_pages(bio)->raid_bio; 128 128 } 129 129 130 - static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 130 + static void *r1bio_pool_alloc(gfp_t gfp_flags, struct r1conf *conf) 131 131 { 132 - struct pool_info *pi = data; 133 - int size = offsetof(struct r1bio, bios[pi->raid_disks]); 132 + int size = offsetof(struct r1bio, bios[conf->raid_disks * 2]); 134 133 135 134 /* allocate a r1bio with room for raid_disks entries in the bios array */ 136 135 return kzalloc(size, gfp_flags); ··· 144 145 145 146 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) 146 147 { 147 - struct pool_info *pi = data; 148 + struct r1conf *conf = data; 148 149 struct r1bio *r1_bio; 149 150 struct bio *bio; 150 151 int need_pages; 151 152 int j; 152 153 struct resync_pages *rps; 153 154 154 - r1_bio = r1bio_pool_alloc(gfp_flags, pi); 155 + r1_bio = r1bio_pool_alloc(gfp_flags, conf); 155 156 if (!r1_bio) 156 157 return NULL; 157 158 158 - rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages), 159 + rps = kmalloc_array(conf->raid_disks * 2, sizeof(struct resync_pages), 159 160 gfp_flags); 160 161 if (!rps) 161 162 goto out_free_r1bio; ··· 163 164 /* 164 165 * Allocate bios : 1 for reading, n-1 for writing 165 166 */ 166 - for (j = pi->raid_disks ; j-- ; ) { 167 + for (j = conf->raid_disks * 2; j-- ; ) { 167 168 bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); 168 169 if (!bio) 169 170 goto out_free_bio; ··· 176 177 * If this is a user-requested check/repair, allocate 177 178 * RESYNC_PAGES for each bio. 178 179 */ 179 - if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) 180 - need_pages = pi->raid_disks; 180 + if (test_bit(MD_RECOVERY_REQUESTED, &conf->mddev->recovery)) 181 + need_pages = conf->raid_disks * 2; 181 182 else 182 183 need_pages = 1; 183 - for (j = 0; j < pi->raid_disks; j++) { 184 + for (j = 0; j < conf->raid_disks * 2; j++) { 184 185 struct resync_pages *rp = &rps[j]; 185 186 186 187 bio = r1_bio->bios[j]; ··· 206 207 resync_free_pages(&rps[j]); 207 208 208 209 out_free_bio: 209 - while (++j < pi->raid_disks) { 210 + while (++j < conf->raid_disks * 2) { 210 211 bio_uninit(r1_bio->bios[j]); 211 212 kfree(r1_bio->bios[j]); 212 213 } ··· 219 220 220 221 static void r1buf_pool_free(void *__r1_bio, void *data) 221 222 { 222 - struct pool_info *pi = data; 223 + struct r1conf *conf = data; 223 224 int i; 224 225 struct r1bio *r1bio = __r1_bio; 225 226 struct resync_pages *rp = NULL; 226 227 227 - for (i = pi->raid_disks; i--; ) { 228 + for (i = conf->raid_disks * 2; i--; ) { 228 229 rp = get_resync_pages(r1bio->bios[i]); 229 230 resync_free_pages(rp); 230 231 bio_uninit(r1bio->bios[i]); ··· 254 255 struct r1conf *conf = r1_bio->mddev->private; 255 256 256 257 put_all_bios(conf, r1_bio); 257 - mempool_free(r1_bio, &conf->r1bio_pool); 258 + mempool_free(r1_bio, conf->r1bio_pool); 258 259 } 259 260 260 261 static void put_buf(struct r1bio *r1_bio) ··· 1304 1305 struct r1conf *conf = mddev->private; 1305 1306 struct r1bio *r1_bio; 1306 1307 1307 - r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO); 1308 - /* Ensure no bio records IO_BLOCKED */ 1309 - memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0])); 1308 + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 1309 + memset(r1_bio, 0, offsetof(struct r1bio, bios[conf->raid_disks * 2])); 1310 1310 init_r1bio(r1_bio, mddev, bio); 1311 1311 return r1_bio; 1312 1312 } ··· 2745 2747 BUG_ON(mempool_initialized(&conf->r1buf_pool)); 2746 2748 2747 2749 return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc, 2748 - r1buf_pool_free, conf->poolinfo); 2750 + r1buf_pool_free, conf); 2749 2751 } 2750 2752 2751 2753 static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) ··· 2755 2757 struct bio *bio; 2756 2758 int i; 2757 2759 2758 - for (i = conf->poolinfo->raid_disks; i--; ) { 2760 + for (i = conf->raid_disks * 2; i--; ) { 2759 2761 bio = r1bio->bios[i]; 2760 2762 rps = bio->bi_private; 2761 2763 bio_reset(bio, NULL, 0); ··· 2820 2822 } 2821 2823 2822 2824 if (mddev->bitmap == NULL && 2823 - mddev->recovery_cp == MaxSector && 2825 + mddev->resync_offset == MaxSector && 2824 2826 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 2825 2827 conf->fullsync == 0) { 2826 2828 *skipped = 1; ··· 3083 3085 int i; 3084 3086 struct raid1_info *disk; 3085 3087 struct md_rdev *rdev; 3088 + size_t r1bio_size; 3086 3089 int err = -ENOMEM; 3087 3090 3088 3091 conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL); ··· 3120 3121 if (!conf->tmppage) 3121 3122 goto abort; 3122 3123 3123 - conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 3124 - if (!conf->poolinfo) 3125 - goto abort; 3126 - conf->poolinfo->raid_disks = mddev->raid_disks * 2; 3127 - err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc, 3128 - rbio_pool_free, conf->poolinfo); 3129 - if (err) 3124 + r1bio_size = offsetof(struct r1bio, bios[mddev->raid_disks * 2]); 3125 + conf->r1bio_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, r1bio_size); 3126 + if (!conf->r1bio_pool) 3130 3127 goto abort; 3131 3128 3132 3129 err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); 3133 3130 if (err) 3134 3131 goto abort; 3135 - 3136 - conf->poolinfo->mddev = mddev; 3137 3132 3138 3133 err = -EINVAL; 3139 3134 spin_lock_init(&conf->device_lock); ··· 3191 3198 3192 3199 abort: 3193 3200 if (conf) { 3194 - mempool_exit(&conf->r1bio_pool); 3201 + mempool_destroy(conf->r1bio_pool); 3195 3202 kfree(conf->mirrors); 3196 3203 safe_put_page(conf->tmppage); 3197 - kfree(conf->poolinfo); 3198 3204 kfree(conf->nr_pending); 3199 3205 kfree(conf->nr_waiting); 3200 3206 kfree(conf->nr_queued); ··· 3274 3282 } 3275 3283 3276 3284 if (conf->raid_disks - mddev->degraded == 1) 3277 - mddev->recovery_cp = MaxSector; 3285 + mddev->resync_offset = MaxSector; 3278 3286 3279 - if (mddev->recovery_cp != MaxSector) 3287 + if (mddev->resync_offset != MaxSector) 3280 3288 pr_info("md/raid1:%s: not clean -- starting background reconstruction\n", 3281 3289 mdname(mddev)); 3282 3290 pr_info("md/raid1:%s: active with %d out of %d mirrors\n", ··· 3303 3311 { 3304 3312 struct r1conf *conf = priv; 3305 3313 3306 - mempool_exit(&conf->r1bio_pool); 3314 + mempool_destroy(conf->r1bio_pool); 3307 3315 kfree(conf->mirrors); 3308 3316 safe_put_page(conf->tmppage); 3309 - kfree(conf->poolinfo); 3310 3317 kfree(conf->nr_pending); 3311 3318 kfree(conf->nr_waiting); 3312 3319 kfree(conf->nr_queued); ··· 3336 3345 3337 3346 md_set_array_sectors(mddev, newsize); 3338 3347 if (sectors > mddev->dev_sectors && 3339 - mddev->recovery_cp > mddev->dev_sectors) { 3340 - mddev->recovery_cp = mddev->dev_sectors; 3348 + mddev->resync_offset > mddev->dev_sectors) { 3349 + mddev->resync_offset = mddev->dev_sectors; 3341 3350 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3342 3351 } 3343 3352 mddev->dev_sectors = sectors; ··· 3358 3367 * At the same time, we "pack" the devices so that all the missing 3359 3368 * devices have the higher raid_disk numbers. 3360 3369 */ 3361 - mempool_t newpool, oldpool; 3362 - struct pool_info *newpoolinfo; 3370 + mempool_t *newpool, *oldpool; 3371 + size_t new_r1bio_size; 3363 3372 struct raid1_info *newmirrors; 3364 3373 struct r1conf *conf = mddev->private; 3365 3374 int cnt, raid_disks; 3366 3375 unsigned long flags; 3367 3376 int d, d2; 3368 - int ret; 3369 - 3370 - memset(&newpool, 0, sizeof(newpool)); 3371 - memset(&oldpool, 0, sizeof(oldpool)); 3372 3377 3373 3378 /* Cannot change chunk_size, layout, or level */ 3374 3379 if (mddev->chunk_sectors != mddev->new_chunk_sectors || ··· 3390 3403 return -EBUSY; 3391 3404 } 3392 3405 3393 - newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); 3394 - if (!newpoolinfo) 3406 + new_r1bio_size = offsetof(struct r1bio, bios[raid_disks * 2]); 3407 + newpool = mempool_create_kmalloc_pool(NR_RAID_BIOS, new_r1bio_size); 3408 + if (!newpool) { 3395 3409 return -ENOMEM; 3396 - newpoolinfo->mddev = mddev; 3397 - newpoolinfo->raid_disks = raid_disks * 2; 3398 - 3399 - ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc, 3400 - rbio_pool_free, newpoolinfo); 3401 - if (ret) { 3402 - kfree(newpoolinfo); 3403 - return ret; 3404 3410 } 3405 3411 newmirrors = kzalloc(array3_size(sizeof(struct raid1_info), 3406 3412 raid_disks, 2), 3407 3413 GFP_KERNEL); 3408 3414 if (!newmirrors) { 3409 - kfree(newpoolinfo); 3410 - mempool_exit(&newpool); 3415 + mempool_destroy(newpool); 3411 3416 return -ENOMEM; 3412 3417 } 3413 3418 ··· 3408 3429 /* ok, everything is stopped */ 3409 3430 oldpool = conf->r1bio_pool; 3410 3431 conf->r1bio_pool = newpool; 3411 - init_waitqueue_head(&conf->r1bio_pool.wait); 3412 3432 3413 3433 for (d = d2 = 0; d < conf->raid_disks; d++) { 3414 3434 struct md_rdev *rdev = conf->mirrors[d].rdev; ··· 3424 3446 } 3425 3447 kfree(conf->mirrors); 3426 3448 conf->mirrors = newmirrors; 3427 - kfree(conf->poolinfo); 3428 - conf->poolinfo = newpoolinfo; 3429 3449 3430 3450 spin_lock_irqsave(&conf->device_lock, flags); 3431 3451 mddev->degraded += (raid_disks - conf->raid_disks); ··· 3437 3461 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3438 3462 md_wakeup_thread(mddev->thread); 3439 3463 3440 - mempool_exit(&oldpool); 3464 + mempool_destroy(oldpool); 3441 3465 return 0; 3442 3466 } 3443 3467

+1 -21

drivers/md/raid1.h

··· 49 49 sector_t seq_start; 50 50 }; 51 51 52 - /* 53 - * memory pools need a pointer to the mddev, so they can force an unplug 54 - * when memory is tight, and a count of the number of drives that the 55 - * pool was allocated for, so they know how much to allocate and free. 56 - * mddev->raid_disks cannot be used, as it can change while a pool is active 57 - * These two datums are stored in a kmalloced struct. 58 - * The 'raid_disks' here is twice the raid_disks in r1conf. 59 - * This allows space for each 'real' device can have a replacement in the 60 - * second half of the array. 61 - */ 62 - 63 - struct pool_info { 64 - struct mddev *mddev; 65 - int raid_disks; 66 - }; 67 - 68 52 struct r1conf { 69 53 struct mddev *mddev; 70 54 struct raid1_info *mirrors; /* twice 'raid_disks' to ··· 98 114 */ 99 115 int recovery_disabled; 100 116 101 - /* poolinfo contains information about the content of the 102 - * mempools - it changes when the array grows or shrinks 103 - */ 104 - struct pool_info *poolinfo; 105 - mempool_t r1bio_pool; 117 + mempool_t *r1bio_pool; 106 118 mempool_t r1buf_pool; 107 119 108 120 struct bio_set bio_split;

+8 -8

drivers/md/raid10.c

··· 2117 2117 int last = conf->geo.raid_disks - 1; 2118 2118 struct raid10_info *p; 2119 2119 2120 - if (mddev->recovery_cp < MaxSector) 2120 + if (mddev->resync_offset < MaxSector) 2121 2121 /* only hot-add to in-sync arrays, as recovery is 2122 2122 * very different from resync 2123 2123 */ ··· 3185 3185 * of a clean array, like RAID1 does. 3186 3186 */ 3187 3187 if (mddev->bitmap == NULL && 3188 - mddev->recovery_cp == MaxSector && 3188 + mddev->resync_offset == MaxSector && 3189 3189 mddev->reshape_position == MaxSector && 3190 3190 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 3191 3191 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && ··· 4145 4145 disk->recovery_disabled = mddev->recovery_disabled - 1; 4146 4146 } 4147 4147 4148 - if (mddev->recovery_cp != MaxSector) 4148 + if (mddev->resync_offset != MaxSector) 4149 4149 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", 4150 4150 mdname(mddev)); 4151 4151 pr_info("md/raid10:%s: active with %d out of %d devices\n", ··· 4245 4245 4246 4246 md_set_array_sectors(mddev, size); 4247 4247 if (sectors > mddev->dev_sectors && 4248 - mddev->recovery_cp > oldsize) { 4249 - mddev->recovery_cp = oldsize; 4248 + mddev->resync_offset > oldsize) { 4249 + mddev->resync_offset = oldsize; 4250 4250 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4251 4251 } 4252 4252 calc_sectors(conf, sectors); ··· 4275 4275 mddev->delta_disks = mddev->raid_disks; 4276 4276 mddev->raid_disks *= 2; 4277 4277 /* make sure it will be not marked as dirty */ 4278 - mddev->recovery_cp = MaxSector; 4278 + mddev->resync_offset = MaxSector; 4279 4279 mddev->dev_sectors = size; 4280 4280 4281 4281 conf = setup_conf(mddev); ··· 5087 5087 return; 5088 5088 5089 5089 if (mddev->delta_disks > 0) { 5090 - if (mddev->recovery_cp > mddev->resync_max_sectors) { 5091 - mddev->recovery_cp = mddev->resync_max_sectors; 5090 + if (mddev->resync_offset > mddev->resync_max_sectors) { 5091 + mddev->resync_offset = mddev->resync_max_sectors; 5092 5092 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5093 5093 } 5094 5094 mddev->resync_max_sectors = mddev->array_sectors;

+3 -3

drivers/md/raid5-ppl.c

··· 1163 1163 le64_to_cpu(pplhdr->generation)); 1164 1164 1165 1165 /* attempt to recover from log if we are starting a dirty array */ 1166 - if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector) 1166 + if (pplhdr && !mddev->pers && mddev->resync_offset != MaxSector) 1167 1167 ret = ppl_recover(log, pplhdr, pplhdr_offset); 1168 1168 1169 1169 /* write empty header if we are starting the array */ ··· 1422 1422 1423 1423 if (ret) { 1424 1424 goto err; 1425 - } else if (!mddev->pers && mddev->recovery_cp == 0 && 1425 + } else if (!mddev->pers && mddev->resync_offset == 0 && 1426 1426 ppl_conf->recovered_entries > 0 && 1427 1427 ppl_conf->mismatch_count == 0) { 1428 1428 /* 1429 1429 * If we are starting a dirty array and the recovery succeeds 1430 1430 * without any issues, set the array as clean. 1431 1431 */ 1432 - mddev->recovery_cp = MaxSector; 1432 + mddev->resync_offset = MaxSector; 1433 1433 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 1434 1434 } else if (mddev->pers && ppl_conf->mismatch_count > 0) { 1435 1435 /* no mismatch allowed when enabling PPL for a running array */

+15 -15

drivers/md/raid5.c

··· 3740 3740 && !test_bit(Faulty, &rdev->flags) 3741 3741 && !test_bit(In_sync, &rdev->flags) 3742 3742 && (rdev->recovery_offset <= sh->sector 3743 - || rdev->mddev->recovery_cp <= sh->sector)) 3743 + || rdev->mddev->resync_offset <= sh->sector)) 3744 3744 rv = 1; 3745 3745 return rv; 3746 3746 } ··· 3832 3832 * is missing/faulty, then we need to read everything we can. 3833 3833 */ 3834 3834 if (!force_rcw && 3835 - sh->sector < sh->raid_conf->mddev->recovery_cp) 3835 + sh->sector < sh->raid_conf->mddev->resync_offset) 3836 3836 /* reconstruct-write isn't being forced */ 3837 3837 return 0; 3838 3838 for (i = 0; i < s->failed && i < 2; i++) { ··· 4097 4097 int disks) 4098 4098 { 4099 4099 int rmw = 0, rcw = 0, i; 4100 - sector_t recovery_cp = conf->mddev->recovery_cp; 4100 + sector_t resync_offset = conf->mddev->resync_offset; 4101 4101 4102 4102 /* Check whether resync is now happening or should start. 4103 4103 * If yes, then the array is dirty (after unclean shutdown or ··· 4107 4107 * generate correct data from the parity. 4108 4108 */ 4109 4109 if (conf->rmw_level == PARITY_DISABLE_RMW || 4110 - (recovery_cp < MaxSector && sh->sector >= recovery_cp && 4110 + (resync_offset < MaxSector && sh->sector >= resync_offset && 4111 4111 s->failed == 0)) { 4112 4112 /* Calculate the real rcw later - for now make it 4113 4113 * look like rcw is cheaper 4114 4114 */ 4115 4115 rcw = 1; rmw = 2; 4116 - pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 4117 - conf->rmw_level, (unsigned long long)recovery_cp, 4116 + pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n", 4117 + conf->rmw_level, (unsigned long long)resync_offset, 4118 4118 (unsigned long long)sh->sector); 4119 4119 } else for (i = disks; i--; ) { 4120 4120 /* would I have to read this buffer for read_modify_write */ ··· 4770 4770 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4771 4771 /* If there is a failed device being replaced, 4772 4772 * we must be recovering. 4773 - * else if we are after recovery_cp, we must be syncing 4773 + * else if we are after resync_offset, we must be syncing 4774 4774 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4775 4775 * else we can only be replacing 4776 4776 * sync and recovery both need to read all devices, and so 4777 4777 * use the same flag. 4778 4778 */ 4779 4779 if (do_recovery || 4780 - sh->sector >= conf->mddev->recovery_cp || 4780 + sh->sector >= conf->mddev->resync_offset || 4781 4781 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4782 4782 s->syncing = 1; 4783 4783 else ··· 7780 7780 int first = 1; 7781 7781 int ret = -EIO; 7782 7782 7783 - if (mddev->recovery_cp != MaxSector) 7783 + if (mddev->resync_offset != MaxSector) 7784 7784 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 7785 7785 mdname(mddev)); 7786 7786 ··· 7921 7921 mdname(mddev)); 7922 7922 mddev->ro = 1; 7923 7923 set_disk_ro(mddev->gendisk, 1); 7924 - } else if (mddev->recovery_cp == MaxSector) 7924 + } else if (mddev->resync_offset == MaxSector) 7925 7925 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 7926 7926 } 7927 7927 ··· 7988 7988 mddev->resync_max_sectors = mddev->dev_sectors; 7989 7989 7990 7990 if (mddev->degraded > dirty_parity_disks && 7991 - mddev->recovery_cp != MaxSector) { 7991 + mddev->resync_offset != MaxSector) { 7992 7992 if (test_bit(MD_HAS_PPL, &mddev->flags)) 7993 7993 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", 7994 7994 mdname(mddev)); ··· 8328 8328 8329 8329 md_set_array_sectors(mddev, newsize); 8330 8330 if (sectors > mddev->dev_sectors && 8331 - mddev->recovery_cp > mddev->dev_sectors) { 8332 - mddev->recovery_cp = mddev->dev_sectors; 8331 + mddev->resync_offset > mddev->dev_sectors) { 8332 + mddev->resync_offset = mddev->dev_sectors; 8333 8333 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8334 8334 } 8335 8335 mddev->dev_sectors = sectors; ··· 8423 8423 return -EINVAL; 8424 8424 8425 8425 /* raid5 can't handle concurrent reshape and recovery */ 8426 - if (mddev->recovery_cp < MaxSector) 8426 + if (mddev->resync_offset < MaxSector) 8427 8427 return -EBUSY; 8428 8428 for (i = 0; i < conf->raid_disks; i++) 8429 8429 if (conf->disks[i].replacement) ··· 8648 8648 mddev->raid_disks += 1; 8649 8649 mddev->delta_disks = 1; 8650 8650 /* make sure it will be not marked as dirty */ 8651 - mddev->recovery_cp = MaxSector; 8651 + mddev->resync_offset = MaxSector; 8652 8652 8653 8653 return setup_conf(mddev); 8654 8654 }

+2 -2

drivers/nvme/host/auth.c

··· 742 742 "%s: qid %d failed to generate digest, error %d\n", 743 743 __func__, chap->qid, ret); 744 744 goto out_free_psk; 745 - }; 745 + } 746 746 dev_dbg(ctrl->device, "%s: generated digest %s\n", 747 747 __func__, digest); 748 748 ret = nvme_auth_derive_tls_psk(chap->hash_id, psk, psk_len, ··· 752 752 "%s: qid %d failed to derive TLS psk, error %d\n", 753 753 __func__, chap->qid, ret); 754 754 goto out_free_digest; 755 - }; 755 + } 756 756 757 757 tls_key = nvme_tls_psk_refresh(ctrl->opts->keyring, 758 758 ctrl->opts->host->nqn,

+16

drivers/nvme/host/core.c

··· 3158 3158 return ctrl->opts && ctrl->opts->discovery_nqn; 3159 3159 } 3160 3160 3161 + static inline bool nvme_admin_ctrl(struct nvme_ctrl *ctrl) 3162 + { 3163 + return ctrl->cntrltype == NVME_CTRL_ADMIN; 3164 + } 3165 + 3161 3166 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, 3162 3167 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) 3163 3168 { ··· 3674 3669 ret = nvme_init_identify(ctrl); 3675 3670 if (ret) 3676 3671 return ret; 3672 + 3673 + if (nvme_admin_ctrl(ctrl)) { 3674 + /* 3675 + * An admin controller has one admin queue, but no I/O queues. 3676 + * Override queue_count so it only creates an admin queue. 3677 + */ 3678 + dev_dbg(ctrl->device, 3679 + "Subsystem %s is an administrative controller", 3680 + ctrl->subsys->subnqn); 3681 + ctrl->queue_count = 1; 3682 + } 3677 3683 3678 3684 ret = nvme_configure_apst(ctrl); 3679 3685 if (ret < 0)

+2 -2

drivers/nvme/host/fc.c

··· 1363 1363 * down, and the related FC-NVME Association ID and Connection IDs 1364 1364 * become invalid. 1365 1365 * 1366 - * The behavior of the fc-nvme initiator is such that it's 1366 + * The behavior of the fc-nvme initiator is such that its 1367 1367 * understanding of the association and connections will implicitly 1368 1368 * be torn down. The action is implicit as it may be due to a loss of 1369 1369 * connectivity with the fc-nvme target, so you may never get a ··· 2777 2777 * as WRITE ZEROES will return a non-zero rq payload_bytes yet 2778 2778 * there is no actual payload to be transferred. 2779 2779 * To get it right, key data transmission on there being 1 or 2780 - * more physical segments in the sg list. If there is no 2780 + * more physical segments in the sg list. If there are no 2781 2781 * physical segments, there is no payload. 2782 2782 */ 2783 2783 if (blk_rq_nr_phys_segments(rq)) {

+1 -1

drivers/nvme/host/pci.c

··· 935 935 936 936 nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); 937 937 if (unlikely(iter->status)) 938 - nvme_free_sgls(req); 938 + nvme_unmap_data(req); 939 939 return iter->status; 940 940 } 941 941

+1 -1

drivers/nvme/host/tcp.c

··· 2179 2179 2180 2180 /* 2181 2181 * Only start IO queues for which we have allocated the tagset 2182 - * and limitted it to the available queues. On reconnects, the 2182 + * and limited it to the available queues. On reconnects, the 2183 2183 * queue number might have changed. 2184 2184 */ 2185 2185 nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);

+9 -9

drivers/nvme/target/core.c

··· 1960 1960 if (!nvmet_wq) 1961 1961 goto out_free_buffered_work_queue; 1962 1962 1963 - error = nvmet_init_discovery(); 1963 + error = nvmet_init_debugfs(); 1964 1964 if (error) 1965 1965 goto out_free_nvmet_work_queue; 1966 1966 1967 - error = nvmet_init_debugfs(); 1968 - if (error) 1969 - goto out_exit_discovery; 1970 - 1971 - error = nvmet_init_configfs(); 1967 + error = nvmet_init_discovery(); 1972 1968 if (error) 1973 1969 goto out_exit_debugfs; 1974 1970 1971 + error = nvmet_init_configfs(); 1972 + if (error) 1973 + goto out_exit_discovery; 1974 + 1975 1975 return 0; 1976 1976 1977 - out_exit_debugfs: 1978 - nvmet_exit_debugfs(); 1979 1977 out_exit_discovery: 1980 1978 nvmet_exit_discovery(); 1979 + out_exit_debugfs: 1980 + nvmet_exit_debugfs(); 1981 1981 out_free_nvmet_work_queue: 1982 1982 destroy_workqueue(nvmet_wq); 1983 1983 out_free_buffered_work_queue: ··· 1992 1992 static void __exit nvmet_exit(void) 1993 1993 { 1994 1994 nvmet_exit_configfs(); 1995 - nvmet_exit_debugfs(); 1996 1995 nvmet_exit_discovery(); 1996 + nvmet_exit_debugfs(); 1997 1997 ida_destroy(&cntlid_ida); 1998 1998 destroy_workqueue(nvmet_wq); 1999 1999 destroy_workqueue(buffered_io_wq);

+3 -3

drivers/nvme/target/fc.c

··· 459 459 * down, and the related FC-NVME Association ID and Connection IDs 460 460 * become invalid. 461 461 * 462 - * The behavior of the fc-nvme target is such that it's 462 + * The behavior of the fc-nvme target is such that its 463 463 * understanding of the association and connections will implicitly 464 464 * be torn down. The action is implicit as it may be due to a loss of 465 465 * connectivity with the fc-nvme host, so the target may never get a ··· 2313 2313 ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq); 2314 2314 if (ret) { 2315 2315 /* 2316 - * should be ok to set w/o lock as its in the thread of 2316 + * should be ok to set w/o lock as it's in the thread of 2317 2317 * execution (not an async timer routine) and doesn't 2318 2318 * contend with any clearing action 2319 2319 */ ··· 2629 2629 * and the api of the FC LLDD which may issue a hw command to send the 2630 2630 * response, but the LLDD may not get the hw completion for that command 2631 2631 * and upcall the nvmet_fc layer before a new command may be 2632 - * asynchronously received - its possible for a command to be received 2632 + * asynchronously received - it's possible for a command to be received 2633 2633 * before the LLDD and nvmet_fc have recycled the job structure. It gives 2634 2634 * the appearance of more commands received than fits in the sq. 2635 2635 * To alleviate this scenario, a temporary queue is maintained in the

+2

drivers/nvme/target/passthru.c

··· 533 533 case NVME_FEAT_HOST_ID: 534 534 req->execute = nvmet_execute_get_features; 535 535 return NVME_SC_SUCCESS; 536 + case NVME_FEAT_FDP: 537 + return nvmet_setup_passthru_command(req); 536 538 default: 537 539 return nvmet_passthru_get_set_features(req); 538 540 }

+3 -3

drivers/nvme/target/rdma.c

··· 1731 1731 * We registered an ib_client to handle device removal for queues, 1732 1732 * so we only need to handle the listening port cm_ids. In this case 1733 1733 * we nullify the priv to prevent double cm_id destruction and destroying 1734 - * the cm_id implicitely by returning a non-zero rc to the callout. 1734 + * the cm_id implicitly by returning a non-zero rc to the callout. 1735 1735 */ 1736 1736 static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, 1737 1737 struct nvmet_rdma_queue *queue) ··· 1742 1742 /* 1743 1743 * This is a queue cm_id. we have registered 1744 1744 * an ib_client to handle queues removal 1745 - * so don't interfear and just return. 1745 + * so don't interfere and just return. 1746 1746 */ 1747 1747 return 0; 1748 1748 } ··· 1760 1760 1761 1761 /* 1762 1762 * We need to return 1 so that the core will destroy 1763 - * it's own ID. What a great API design.. 1763 + * its own ID. What a great API design.. 1764 1764 */ 1765 1765 return 1; 1766 1766 }

+2 -1

include/linux/ioprio.h

··· 60 60 int prio; 61 61 62 62 if (!ioc) 63 - return IOPRIO_DEFAULT; 63 + return IOPRIO_PRIO_VALUE(task_nice_ioclass(p), 64 + task_nice_ioprio(p)); 64 65 65 66 if (p != current) 66 67 lockdep_assert_held(&p->alloc_lock);

+1 -18

include/linux/sbitmap.h

··· 210 210 int sbitmap_get(struct sbitmap *sb); 211 211 212 212 /** 213 - * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, 214 - * limiting the depth used from each word. 215 - * @sb: Bitmap to allocate from. 216 - * @shallow_depth: The maximum number of bits to allocate from a single word. 217 - * 218 - * This rather specific operation allows for having multiple users with 219 - * different allocation limits. E.g., there can be a high-priority class that 220 - * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() 221 - * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority 222 - * class can only allocate half of the total bits in the bitmap, preventing it 223 - * from starving out the high-priority class. 224 - * 225 - * Return: Non-negative allocated bit number if successful, -1 otherwise. 226 - */ 227 - int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth); 228 - 229 - /** 230 213 * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. 231 214 * @sb: Bitmap to check. 232 215 * ··· 461 478 * sbitmap_queue, limiting the depth used from each word, with preemption 462 479 * already disabled. 463 480 * @sbq: Bitmap queue to allocate from. 464 - * @shallow_depth: The maximum number of bits to allocate from a single word. 481 + * @shallow_depth: The maximum number of bits to allocate from the queue. 465 482 * See sbitmap_get_shallow(). 466 483 * 467 484 * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after

+1 -1

include/uapi/linux/raid/md_p.h

··· 173 173 #else 174 174 #error unspecified endianness 175 175 #endif 176 - __u32 recovery_cp; /* 11 recovery checkpoint sector count */ 176 + __u32 resync_offset; /* 11 resync checkpoint sector count */ 177 177 /* There are only valid for minor_version > 90 */ 178 178 __u64 reshape_position; /* 12,13 next address in array-space for reshape */ 179 179 __u32 new_level; /* 14 new level we are reshaping to */

+45 -29

lib/sbitmap.c

··· 208 208 return nr; 209 209 } 210 210 211 + static unsigned int __map_depth_with_shallow(const struct sbitmap *sb, 212 + int index, 213 + unsigned int shallow_depth) 214 + { 215 + u64 shallow_word_depth; 216 + unsigned int word_depth, reminder; 217 + 218 + word_depth = __map_depth(sb, index); 219 + if (shallow_depth >= sb->depth) 220 + return word_depth; 221 + 222 + shallow_word_depth = word_depth * shallow_depth; 223 + reminder = do_div(shallow_word_depth, sb->depth); 224 + 225 + if (reminder >= (index + 1) * word_depth) 226 + shallow_word_depth++; 227 + 228 + return (unsigned int)shallow_word_depth; 229 + } 230 + 211 231 static int sbitmap_find_bit(struct sbitmap *sb, 212 - unsigned int depth, 232 + unsigned int shallow_depth, 213 233 unsigned int index, 214 234 unsigned int alloc_hint, 215 235 bool wrap) ··· 238 218 int nr = -1; 239 219 240 220 for (i = 0; i < sb->map_nr; i++) { 241 - nr = sbitmap_find_bit_in_word(&sb->map[index], 242 - min_t(unsigned int, 243 - __map_depth(sb, index), 244 - depth), 245 - alloc_hint, wrap); 221 + unsigned int depth = __map_depth_with_shallow(sb, index, 222 + shallow_depth); 246 223 224 + if (depth) 225 + nr = sbitmap_find_bit_in_word(&sb->map[index], depth, 226 + alloc_hint, wrap); 247 227 if (nr != -1) { 248 228 nr += index << sb->shift; 249 229 break; ··· 307 287 return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true); 308 288 } 309 289 310 - int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth) 290 + /** 291 + * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, 292 + * limiting the depth used from each word. 293 + * @sb: Bitmap to allocate from. 294 + * @shallow_depth: The maximum number of bits to allocate from the bitmap. 295 + * 296 + * This rather specific operation allows for having multiple users with 297 + * different allocation limits. E.g., there can be a high-priority class that 298 + * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() 299 + * with a @shallow_depth of (sb->depth >> 1). Then, the low-priority 300 + * class can only allocate half of the total bits in the bitmap, preventing it 301 + * from starving out the high-priority class. 302 + * 303 + * Return: Non-negative allocated bit number if successful, -1 otherwise. 304 + */ 305 + static int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth) 311 306 { 312 307 int nr; 313 308 unsigned int hint, depth; ··· 337 302 338 303 return nr; 339 304 } 340 - EXPORT_SYMBOL_GPL(sbitmap_get_shallow); 341 305 342 306 bool sbitmap_any_bit_set(const struct sbitmap *sb) 343 307 { ··· 440 406 static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, 441 407 unsigned int depth) 442 408 { 443 - unsigned int wake_batch; 444 - unsigned int shallow_depth; 445 - 446 - /* 447 - * Each full word of the bitmap has bits_per_word bits, and there might 448 - * be a partial word. There are depth / bits_per_word full words and 449 - * depth % bits_per_word bits left over. In bitwise arithmetic: 450 - * 451 - * bits_per_word = 1 << shift 452 - * depth / bits_per_word = depth >> shift 453 - * depth % bits_per_word = depth & ((1 << shift) - 1) 454 - * 455 - * Each word can be limited to sbq->min_shallow_depth bits. 456 - */ 457 - shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth); 458 - depth = ((depth >> sbq->sb.shift) * shallow_depth + 459 - min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth)); 460 - wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1, 461 - SBQ_WAKE_BATCH); 462 - 463 - return wake_batch; 409 + return clamp_t(unsigned int, 410 + min(depth, sbq->min_shallow_depth) / SBQ_WAIT_QUEUES, 411 + 1, SBQ_WAKE_BATCH); 464 412 } 465 413 466 414 int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,

Configure Feed

Configure Feed