Merge tag 'md-3.7' of git://neil.brown.name/md

+9

Documentation/device-mapper/dm-raid.txt

··· 132 132 which are 'A'live, and the array is 2/490221568 complete with recovery. 133 133 Faulty or missing devices are marked 'D'. Devices that are out-of-sync 134 134 are marked 'a'. 135 + 136 + 137 + Version History 138 + --------------- 139 + 1.0.0 Initial version. Support for RAID 4/5/6 140 + 1.1.0 Added support for RAID 1 141 + 1.2.0 Handle creation of arrays that contain failed devices. 142 + 1.3.0 Added support for RAID 10 143 + 1.3.1 Allow device replacement/rebuild for RAID 10

+2 -2

crypto/xor.c

··· 56 56 EXPORT_SYMBOL(xor_blocks); 57 57 58 58 /* Set of all registered templates. */ 59 - static struct xor_block_template *template_list; 59 + static struct xor_block_template *__initdata template_list; 60 60 61 61 #define BENCH_SIZE (PAGE_SIZE) 62 62 63 - static void 63 + static void __init 64 64 do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) 65 65 { 66 66 int speed;

+5 -12

drivers/md/bitmap.c

··· 163 163 * As devices are only added or removed when raid_disk is < 0 and 164 164 * nr_pending is 0 and In_sync is clear, the entries we return will 165 165 * still be in the same position on the list when we re-enter 166 - * list_for_each_continue_rcu. 166 + * list_for_each_entry_continue_rcu. 167 167 */ 168 - struct list_head *pos; 169 168 rcu_read_lock(); 170 169 if (rdev == NULL) 171 170 /* start at the beginning */ 172 - pos = &mddev->disks; 171 + rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set); 173 172 else { 174 173 /* release the previous rdev and start from there. */ 175 174 rdev_dec_pending(rdev, mddev); 176 - pos = &rdev->same_set; 177 175 } 178 - list_for_each_continue_rcu(pos, &mddev->disks) { 179 - rdev = list_entry(pos, struct md_rdev, same_set); 176 + list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) { 180 177 if (rdev->raid_disk >= 0 && 181 178 !test_bit(Faulty, &rdev->flags)) { 182 179 /* this is a usable devices */ ··· 470 473 { 471 474 bitmap_super_t *sb; 472 475 unsigned long chunksize, daemon_sleep, write_behind; 473 - int err = -EINVAL; 474 476 475 477 bitmap->storage.sb_page = alloc_page(GFP_KERNEL); 476 - if (IS_ERR(bitmap->storage.sb_page)) { 477 - err = PTR_ERR(bitmap->storage.sb_page); 478 - bitmap->storage.sb_page = NULL; 479 - return err; 480 - } 478 + if (bitmap->storage.sb_page == NULL) 479 + return -ENOMEM; 481 480 bitmap->storage.sb_page->index = 0; 482 481 483 482 sb = kmap_atomic(bitmap->storage.sb_page);

+97 -27

drivers/md/dm-raid.c

··· 338 338 } 339 339 340 340 /* 341 + * validate_rebuild_devices 342 + * @rs 343 + * 344 + * Determine if the devices specified for rebuild can result in a valid 345 + * usable array that is capable of rebuilding the given devices. 346 + * 347 + * Returns: 0 on success, -EINVAL on failure. 348 + */ 349 + static int validate_rebuild_devices(struct raid_set *rs) 350 + { 351 + unsigned i, rebuild_cnt = 0; 352 + unsigned rebuilds_per_group, copies, d; 353 + 354 + if (!(rs->print_flags & DMPF_REBUILD)) 355 + return 0; 356 + 357 + for (i = 0; i < rs->md.raid_disks; i++) 358 + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) 359 + rebuild_cnt++; 360 + 361 + switch (rs->raid_type->level) { 362 + case 1: 363 + if (rebuild_cnt >= rs->md.raid_disks) 364 + goto too_many; 365 + break; 366 + case 4: 367 + case 5: 368 + case 6: 369 + if (rebuild_cnt > rs->raid_type->parity_devs) 370 + goto too_many; 371 + break; 372 + case 10: 373 + copies = raid10_md_layout_to_copies(rs->md.layout); 374 + if (rebuild_cnt < copies) 375 + break; 376 + 377 + /* 378 + * It is possible to have a higher rebuild count for RAID10, 379 + * as long as the failed devices occur in different mirror 380 + * groups (i.e. different stripes). 381 + * 382 + * Right now, we only allow for "near" copies. When other 383 + * formats are added, we will have to check those too. 384 + * 385 + * When checking "near" format, make sure no adjacent devices 386 + * have failed beyond what can be handled. In addition to the 387 + * simple case where the number of devices is a multiple of the 388 + * number of copies, we must also handle cases where the number 389 + * of devices is not a multiple of the number of copies. 390 + * E.g. dev1 dev2 dev3 dev4 dev5 391 + * A A B B C 392 + * C D D E E 393 + */ 394 + rebuilds_per_group = 0; 395 + for (i = 0; i < rs->md.raid_disks * copies; i++) { 396 + d = i % rs->md.raid_disks; 397 + if (!test_bit(In_sync, &rs->dev[d].rdev.flags) && 398 + (++rebuilds_per_group >= copies)) 399 + goto too_many; 400 + if (!((i + 1) % copies)) 401 + rebuilds_per_group = 0; 402 + } 403 + break; 404 + default: 405 + DMERR("The rebuild parameter is not supported for %s", 406 + rs->raid_type->name); 407 + rs->ti->error = "Rebuild not supported for this RAID type"; 408 + return -EINVAL; 409 + } 410 + 411 + return 0; 412 + 413 + too_many: 414 + rs->ti->error = "Too many rebuild devices specified"; 415 + return -EINVAL; 416 + } 417 + 418 + /* 341 419 * Possible arguments are... 342 420 * <chunk_size> [optional_args] 343 421 * ··· 443 365 { 444 366 char *raid10_format = "near"; 445 367 unsigned raid10_copies = 2; 446 - unsigned i, rebuild_cnt = 0; 368 + unsigned i; 447 369 unsigned long value, region_size = 0; 448 370 sector_t sectors_per_dev = rs->ti->len; 449 371 sector_t max_io_len; ··· 539 461 540 462 /* Parameters that take a numeric value are checked here */ 541 463 if (!strcasecmp(key, "rebuild")) { 542 - rebuild_cnt++; 543 - 544 - switch (rs->raid_type->level) { 545 - case 1: 546 - if (rebuild_cnt >= rs->md.raid_disks) { 547 - rs->ti->error = "Too many rebuild devices specified"; 548 - return -EINVAL; 549 - } 550 - break; 551 - case 4: 552 - case 5: 553 - case 6: 554 - if (rebuild_cnt > rs->raid_type->parity_devs) { 555 - rs->ti->error = "Too many rebuild devices specified for given RAID type"; 556 - return -EINVAL; 557 - } 558 - break; 559 - case 10: 560 - default: 561 - DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); 562 - rs->ti->error = "Rebuild not supported for this RAID type"; 563 - return -EINVAL; 564 - } 565 - 566 - if (value > rs->md.raid_disks) { 464 + if (value >= rs->md.raid_disks) { 567 465 rs->ti->error = "Invalid rebuild index given"; 568 466 return -EINVAL; 569 467 } ··· 661 607 return -EINVAL; 662 608 } 663 609 rs->md.dev_sectors = sectors_per_dev; 610 + 611 + if (validate_rebuild_devices(rs)) 612 + return -EINVAL; 664 613 665 614 /* Assume there are no metadata devices until the drives are parsed */ 666 615 rs->md.persistent = 0; ··· 1017 960 1018 961 freshest = NULL; 1019 962 rdev_for_each_safe(rdev, tmp, mddev) { 963 + /* 964 + * Skipping super_load due to DMPF_SYNC will cause 965 + * the array to undergo initialization again as 966 + * though it were new. This is the intended effect 967 + * of the "sync" directive. 968 + * 969 + * When reshaping capability is added, we must ensure 970 + * that the "sync" directive is disallowed during the 971 + * reshape. 972 + */ 973 + if (rs->print_flags & DMPF_SYNC) 974 + continue; 975 + 1020 976 if (!rdev->meta_bdev) 1021 977 continue; 1022 978 ··· 1430 1360 1431 1361 static struct target_type raid_target = { 1432 1362 .name = "raid", 1433 - .version = {1, 3, 0}, 1363 + .version = {1, 3, 1}, 1434 1364 .module = THIS_MODULE, 1435 1365 .ctr = raid_ctr, 1436 1366 .dtr = raid_dtr,

+23 -2

drivers/md/linear.c

··· 138 138 struct linear_conf *conf; 139 139 struct md_rdev *rdev; 140 140 int i, cnt; 141 + bool discard_supported = false; 141 142 142 143 conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), 143 144 GFP_KERNEL); ··· 172 171 conf->array_sectors += rdev->sectors; 173 172 cnt++; 174 173 174 + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) 175 + discard_supported = true; 175 176 } 176 177 if (cnt != raid_disks) { 177 178 printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", 178 179 mdname(mddev)); 179 180 goto out; 180 181 } 182 + 183 + if (!discard_supported) 184 + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); 185 + else 186 + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); 181 187 182 188 /* 183 189 * Here we calculate the device offsets. ··· 252 244 if (!newconf) 253 245 return -ENOMEM; 254 246 255 - oldconf = rcu_dereference(mddev->private); 247 + oldconf = rcu_dereference_protected(mddev->private, 248 + lockdep_is_held( 249 + &mddev->reconfig_mutex)); 256 250 mddev->raid_disks++; 257 251 rcu_assign_pointer(mddev->private, newconf); 258 252 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); ··· 266 256 267 257 static int linear_stop (struct mddev *mddev) 268 258 { 269 - struct linear_conf *conf = mddev->private; 259 + struct linear_conf *conf = 260 + rcu_dereference_protected(mddev->private, 261 + lockdep_is_held( 262 + &mddev->reconfig_mutex)); 270 263 271 264 /* 272 265 * We do not require rcu protection here since ··· 339 326 bio->bi_sector = bio->bi_sector - start_sector 340 327 + tmp_dev->rdev->data_offset; 341 328 rcu_read_unlock(); 329 + 330 + if (unlikely((bio->bi_rw & REQ_DISCARD) && 331 + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { 332 + /* Just ignore it */ 333 + bio_endio(bio, 0); 334 + return; 335 + } 336 + 342 337 generic_make_request(bio); 343 338 } 344 339

+106 -39

drivers/md/md.c

··· 674 674 return NULL; 675 675 } 676 676 677 - static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) 677 + static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) 678 + { 679 + struct md_rdev *rdev; 680 + 681 + rdev_for_each_rcu(rdev, mddev) 682 + if (rdev->desc_nr == nr) 683 + return rdev; 684 + 685 + return NULL; 686 + } 687 + 688 + static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 678 689 { 679 690 struct md_rdev *rdev; 680 691 681 692 rdev_for_each(rdev, mddev) 693 + if (rdev->bdev->bd_dev == dev) 694 + return rdev; 695 + 696 + return NULL; 697 + } 698 + 699 + static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) 700 + { 701 + struct md_rdev *rdev; 702 + 703 + rdev_for_each_rcu(rdev, mddev) 682 704 if (rdev->bdev->bd_dev == dev) 683 705 return rdev; 684 706 ··· 2044 2022 /* Disable data integrity if non-capable/non-matching disk is being added */ 2045 2023 void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2046 2024 { 2047 - struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); 2048 - struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); 2025 + struct blk_integrity *bi_rdev; 2026 + struct blk_integrity *bi_mddev; 2027 + 2028 + if (!mddev->gendisk) 2029 + return; 2030 + 2031 + bi_rdev = bdev_get_integrity(rdev->bdev); 2032 + bi_mddev = blk_get_integrity(mddev->gendisk); 2049 2033 2050 2034 if (!bi_mddev) /* nothing to do */ 2051 2035 return; ··· 3782 3754 return -EINVAL; 3783 3755 3784 3756 mddev->recovery_cp = n; 3757 + if (mddev->pers) 3758 + set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3785 3759 return len; 3786 3760 } 3787 3761 static struct md_sysfs_entry md_resync_start = ··· 4261 4231 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4262 4232 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4263 4233 } 4234 + if (mddev->ro == 2) { 4235 + /* A write to sync_action is enough to justify 4236 + * canceling read-auto mode 4237 + */ 4238 + mddev->ro = 0; 4239 + md_wakeup_thread(mddev->sync_thread); 4240 + } 4264 4241 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4265 4242 md_wakeup_thread(mddev->thread); 4266 4243 sysfs_notify_dirent_safe(mddev->sysfs_action); ··· 4278 4241 mismatch_cnt_show(struct mddev *mddev, char *page) 4279 4242 { 4280 4243 return sprintf(page, "%llu\n", 4281 - (unsigned long long) mddev->resync_mismatches); 4244 + (unsigned long long) 4245 + atomic64_read(&mddev->resync_mismatches)); 4282 4246 } 4283 4247 4284 4248 static struct md_sysfs_entry md_scan_mode = ··· 4399 4361 4400 4362 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4401 4363 return sprintf(page, "none\n"); 4364 + 4365 + if (mddev->curr_resync == 1 || 4366 + mddev->curr_resync == 2) 4367 + return sprintf(page, "delayed\n"); 4402 4368 4403 4369 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 4404 4370 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) ··· 5249 5207 mddev->new_layout = 0; 5250 5208 mddev->new_chunk_sectors = 0; 5251 5209 mddev->curr_resync = 0; 5252 - mddev->resync_mismatches = 0; 5210 + atomic64_set(&mddev->resync_mismatches, 0); 5253 5211 mddev->suspend_lo = mddev->suspend_hi = 0; 5254 5212 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5255 5213 mddev->recovery = 0; ··· 5551 5509 int nr,working,insync,failed,spare; 5552 5510 struct md_rdev *rdev; 5553 5511 5554 - nr=working=insync=failed=spare=0; 5555 - rdev_for_each(rdev, mddev) { 5512 + nr = working = insync = failed = spare = 0; 5513 + rcu_read_lock(); 5514 + rdev_for_each_rcu(rdev, mddev) { 5556 5515 nr++; 5557 5516 if (test_bit(Faulty, &rdev->flags)) 5558 5517 failed++; ··· 5565 5522 spare++; 5566 5523 } 5567 5524 } 5525 + rcu_read_unlock(); 5568 5526 5569 5527 info.major_version = mddev->major_version; 5570 5528 info.minor_version = mddev->minor_version; ··· 5649 5605 if (copy_from_user(&info, arg, sizeof(info))) 5650 5606 return -EFAULT; 5651 5607 5652 - rdev = find_rdev_nr(mddev, info.number); 5608 + rcu_read_lock(); 5609 + rdev = find_rdev_nr_rcu(mddev, info.number); 5653 5610 if (rdev) { 5654 5611 info.major = MAJOR(rdev->bdev->bd_dev); 5655 5612 info.minor = MINOR(rdev->bdev->bd_dev); ··· 5669 5624 info.raid_disk = -1; 5670 5625 info.state = (1<<MD_DISK_REMOVED); 5671 5626 } 5627 + rcu_read_unlock(); 5672 5628 5673 5629 if (copy_to_user(arg, &info, sizeof(info))) 5674 5630 return -EFAULT; ··· 6278 6232 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 6279 6233 { 6280 6234 struct md_rdev *rdev; 6235 + int err = 0; 6281 6236 6282 6237 if (mddev->pers == NULL) 6283 6238 return -ENODEV; 6284 6239 6285 - rdev = find_rdev(mddev, dev); 6240 + rcu_read_lock(); 6241 + rdev = find_rdev_rcu(mddev, dev); 6286 6242 if (!rdev) 6287 - return -ENODEV; 6288 - 6289 - md_error(mddev, rdev); 6290 - if (!test_bit(Faulty, &rdev->flags)) 6291 - return -EBUSY; 6292 - return 0; 6243 + err = -ENODEV; 6244 + else { 6245 + md_error(mddev, rdev); 6246 + if (!test_bit(Faulty, &rdev->flags)) 6247 + err = -EBUSY; 6248 + } 6249 + rcu_read_unlock(); 6250 + return err; 6293 6251 } 6294 6252 6295 6253 /* ··· 6362 6312 6363 6313 if (!mddev) { 6364 6314 BUG(); 6315 + goto abort; 6316 + } 6317 + 6318 + /* Some actions do not requires the mutex */ 6319 + switch (cmd) { 6320 + case GET_ARRAY_INFO: 6321 + if (!mddev->raid_disks && !mddev->external) 6322 + err = -ENODEV; 6323 + else 6324 + err = get_array_info(mddev, argp); 6325 + goto abort; 6326 + 6327 + case GET_DISK_INFO: 6328 + if (!mddev->raid_disks && !mddev->external) 6329 + err = -ENODEV; 6330 + else 6331 + err = get_disk_info(mddev, argp); 6332 + goto abort; 6333 + 6334 + case SET_DISK_FAULTY: 6335 + err = set_disk_faulty(mddev, new_decode_dev(arg)); 6365 6336 goto abort; 6366 6337 } 6367 6338 ··· 6458 6387 */ 6459 6388 switch (cmd) 6460 6389 { 6461 - case GET_ARRAY_INFO: 6462 - err = get_array_info(mddev, argp); 6463 - goto done_unlock; 6464 - 6465 6390 case GET_BITMAP_FILE: 6466 6391 err = get_bitmap_file(mddev, argp); 6467 - goto done_unlock; 6468 - 6469 - case GET_DISK_INFO: 6470 - err = get_disk_info(mddev, argp); 6471 6392 goto done_unlock; 6472 6393 6473 6394 case RESTART_ARRAY_RW: ··· 6541 6478 6542 6479 case HOT_ADD_DISK: 6543 6480 err = hot_add_disk(mddev, new_decode_dev(arg)); 6544 - goto done_unlock; 6545 - 6546 - case SET_DISK_FAULTY: 6547 - err = set_disk_faulty(mddev, new_decode_dev(arg)); 6548 6481 goto done_unlock; 6549 6482 6550 6483 case RUN_ARRAY: ··· 6700 6641 6701 6642 clear_bit(THREAD_WAKEUP, &thread->flags); 6702 6643 if (!kthread_should_stop()) 6703 - thread->run(thread->mddev); 6644 + thread->run(thread); 6704 6645 } 6705 6646 6706 6647 return 0; ··· 6715 6656 } 6716 6657 } 6717 6658 6718 - struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev, 6719 - const char *name) 6659 + struct md_thread *md_register_thread(void (*run) (struct md_thread *), 6660 + struct mddev *mddev, const char *name) 6720 6661 { 6721 6662 struct md_thread *thread; 6722 6663 ··· 6811 6752 int scale; 6812 6753 unsigned int per_milli; 6813 6754 6814 - resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); 6755 + if (mddev->curr_resync <= 3) 6756 + resync = 0; 6757 + else 6758 + resync = mddev->curr_resync 6759 + - atomic_read(&mddev->recovery_active); 6815 6760 6816 6761 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 6817 6762 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) ··· 7041 6978 if (mddev->curr_resync > 2) { 7042 6979 status_resync(seq, mddev); 7043 6980 seq_printf(seq, "\n "); 7044 - } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 6981 + } else if (mddev->curr_resync >= 1) 7045 6982 seq_printf(seq, "\tresync=DELAYED\n "); 7046 6983 else if (mddev->recovery_cp < MaxSector) 7047 6984 seq_printf(seq, "\tresync=PENDING\n "); ··· 7269 7206 7270 7207 #define SYNC_MARKS 10 7271 7208 #define SYNC_MARK_STEP (3*HZ) 7272 - void md_do_sync(struct mddev *mddev) 7209 + void md_do_sync(struct md_thread *thread) 7273 7210 { 7211 + struct mddev *mddev = thread->mddev; 7274 7212 struct mddev *mddev2; 7275 7213 unsigned int currspeed = 0, 7276 7214 window; ··· 7375 7311 * which defaults to physical size, but can be virtual size 7376 7312 */ 7377 7313 max_sectors = mddev->resync_max_sectors; 7378 - mddev->resync_mismatches = 0; 7314 + atomic64_set(&mddev->resync_mismatches, 0); 7379 7315 /* we don't use the checkpoint if there's a bitmap */ 7380 7316 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7381 7317 j = mddev->resync_min; ··· 7431 7367 "md: resuming %s of %s from checkpoint.\n", 7432 7368 desc, mdname(mddev)); 7433 7369 mddev->curr_resync = j; 7434 - } 7370 + } else 7371 + mddev->curr_resync = 3; /* no longer delayed */ 7435 7372 mddev->curr_resync_completed = j; 7373 + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7374 + md_new_event(mddev); 7436 7375 7437 7376 blk_start_plug(&plug); 7438 7377 while (j < max_sectors) { ··· 7488 7421 break; 7489 7422 7490 7423 j += sectors; 7491 - if (j>1) mddev->curr_resync = j; 7424 + if (j > 2) 7425 + mddev->curr_resync = j; 7492 7426 mddev->curr_mark_cnt = io_sectors; 7493 7427 if (last_check == 0) 7494 7428 /* this is the earliest that rebuild will be ··· 7610 7542 struct md_rdev *rdev; 7611 7543 int spares = 0; 7612 7544 int removed = 0; 7613 - 7614 - mddev->curr_resync_completed = 0; 7615 7545 7616 7546 rdev_for_each(rdev, mddev) 7617 7547 if (rdev->raid_disk >= 0 && ··· 7805 7739 /* Set RUNNING before clearing NEEDED to avoid 7806 7740 * any transients in the value of "sync_action". 7807 7741 */ 7742 + mddev->curr_resync_completed = 0; 7808 7743 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7809 7744 /* Clear some bits that don't mean anything, but 7810 7745 * might be left set ··· 7819 7752 /* no recovery is running. 7820 7753 * remove any failed drives, then 7821 7754 * add spares if possible. 7822 - * Spare are also removed and re-added, to allow 7755 + * Spares are also removed and re-added, to allow 7823 7756 * the personality to fail the re-add. 7824 7757 */ 7825 7758

+5 -4

drivers/md/md.h

··· 282 282 283 283 sector_t resync_max_sectors; /* may be set by personality */ 284 284 285 - sector_t resync_mismatches; /* count of sectors where 285 + atomic64_t resync_mismatches; /* count of sectors where 286 286 * parity/replica mismatch found 287 287 */ 288 288 ··· 540 540 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) 541 541 542 542 struct md_thread { 543 - void (*run) (struct mddev *mddev); 543 + void (*run) (struct md_thread *thread); 544 544 struct mddev *mddev; 545 545 wait_queue_head_t wqueue; 546 546 unsigned long flags; 547 547 struct task_struct *tsk; 548 548 unsigned long timeout; 549 + void *private; 549 550 }; 550 551 551 552 #define THREAD_WAKEUP 0 ··· 585 584 extern int register_md_personality(struct md_personality *p); 586 585 extern int unregister_md_personality(struct md_personality *p); 587 586 extern struct md_thread *md_register_thread( 588 - void (*run)(struct mddev *mddev), 587 + void (*run)(struct md_thread *thread), 589 588 struct mddev *mddev, 590 589 const char *name); 591 590 extern void md_unregister_thread(struct md_thread **threadp); ··· 604 603 extern void md_super_wait(struct mddev *mddev); 605 604 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 606 605 struct page *page, int rw, bool metadata_op); 607 - extern void md_do_sync(struct mddev *mddev); 606 + extern void md_do_sync(struct md_thread *thread); 608 607 extern void md_new_event(struct mddev *mddev); 609 608 extern int md_allow_write(struct mddev *mddev); 610 609 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);

+2 -1

drivers/md/multipath.c

··· 335 335 * 3. Performs writes following reads for array syncronising. 336 336 */ 337 337 338 - static void multipathd (struct mddev *mddev) 338 + static void multipathd(struct md_thread *thread) 339 339 { 340 + struct mddev *mddev = thread->mddev; 340 341 struct multipath_bh *mp_bh; 341 342 struct bio *bio; 342 343 unsigned long flags;

+18 -1

drivers/md/raid0.c

··· 88 88 char b[BDEVNAME_SIZE]; 89 89 char b2[BDEVNAME_SIZE]; 90 90 struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); 91 + bool discard_supported = false; 91 92 92 93 if (!conf) 93 94 return -ENOMEM; ··· 196 195 if (!smallest || (rdev1->sectors < smallest->sectors)) 197 196 smallest = rdev1; 198 197 cnt++; 198 + 199 + if (blk_queue_discard(bdev_get_queue(rdev1->bdev))) 200 + discard_supported = true; 199 201 } 200 202 if (cnt != mddev->raid_disks) { 201 203 printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " ··· 275 271 blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); 276 272 blk_queue_io_opt(mddev->queue, 277 273 (mddev->chunk_sectors << 9) * mddev->raid_disks); 274 + 275 + if (!discard_supported) 276 + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); 277 + else 278 + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); 278 279 279 280 pr_debug("md/raid0:%s: done.\n", mdname(mddev)); 280 281 *private_conf = conf; ··· 432 423 return -EINVAL; 433 424 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); 434 425 blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); 426 + blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); 435 427 436 428 /* if private is not null, we are here after takeover */ 437 429 if (mddev->private == NULL) { ··· 520 510 sector_t sector = bio->bi_sector; 521 511 struct bio_pair *bp; 522 512 /* Sanity check -- queue functions should prevent this happening */ 523 - if (bio->bi_vcnt != 1 || 513 + if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || 524 514 bio->bi_idx != 0) 525 515 goto bad_map; 526 516 /* This is a one page bio that upper layers ··· 545 535 bio->bi_bdev = tmp_dev->bdev; 546 536 bio->bi_sector = sector_offset + zone->dev_start + 547 537 tmp_dev->data_offset; 538 + 539 + if (unlikely((bio->bi_rw & REQ_DISCARD) && 540 + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { 541 + /* Just ignore it */ 542 + bio_endio(bio, 0); 543 + return; 544 + } 548 545 549 546 generic_make_request(bio); 550 547 return;

+29 -8

drivers/md/raid1.c

··· 333 333 spin_unlock_irqrestore(&conf->device_lock, flags); 334 334 } 335 335 336 - if (uptodate) 336 + if (uptodate) { 337 337 raid_end_bio_io(r1_bio); 338 - else { 338 + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 339 + } else { 339 340 /* 340 341 * oops, read error: 341 342 */ ··· 350 349 (unsigned long long)r1_bio->sector); 351 350 set_bit(R1BIO_ReadError, &r1_bio->state); 352 351 reschedule_retry(r1_bio); 352 + /* don't drop the reference on read_disk yet */ 353 353 } 354 - 355 - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 356 354 } 357 355 358 356 static void close_write(struct r1bio *r1_bio) ··· 781 781 while (bio) { /* submit pending writes */ 782 782 struct bio *next = bio->bi_next; 783 783 bio->bi_next = NULL; 784 - generic_make_request(bio); 784 + if (unlikely((bio->bi_rw & REQ_DISCARD) && 785 + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) 786 + /* Just ignore it */ 787 + bio_endio(bio, 0); 788 + else 789 + generic_make_request(bio); 785 790 bio = next; 786 791 } 787 792 } else ··· 999 994 const int rw = bio_data_dir(bio); 1000 995 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 1001 996 const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); 997 + const unsigned long do_discard = (bio->bi_rw 998 + & (REQ_DISCARD | REQ_SECURE)); 1002 999 struct md_rdev *blocked_rdev; 1003 1000 struct blk_plug_cb *cb; 1004 1001 struct raid1_plug_cb *plug = NULL; ··· 1302 1295 conf->mirrors[i].rdev->data_offset); 1303 1296 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1304 1297 mbio->bi_end_io = raid1_end_write_request; 1305 - mbio->bi_rw = WRITE | do_flush_fua | do_sync; 1298 + mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; 1306 1299 mbio->bi_private = r1_bio; 1307 1300 1308 1301 atomic_inc(&r1_bio->remaining); ··· 1556 1549 clear_bit(Unmerged, &rdev->flags); 1557 1550 } 1558 1551 md_integrity_add_rdev(rdev, mddev); 1552 + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) 1553 + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); 1559 1554 print_conf(conf); 1560 1555 return err; 1561 1556 } ··· 1876 1867 } else 1877 1868 j = 0; 1878 1869 if (j >= 0) 1879 - mddev->resync_mismatches += r1_bio->sectors; 1870 + atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); 1880 1871 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) 1881 1872 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { 1882 1873 /* No need to write to this device. */ ··· 2229 2220 unfreeze_array(conf); 2230 2221 } else 2231 2222 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 2223 + rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); 2232 2224 2233 2225 bio = r1_bio->bios[r1_bio->read_disk]; 2234 2226 bdevname(bio->bi_bdev, b); ··· 2295 2285 } 2296 2286 } 2297 2287 2298 - static void raid1d(struct mddev *mddev) 2288 + static void raid1d(struct md_thread *thread) 2299 2289 { 2290 + struct mddev *mddev = thread->mddev; 2300 2291 struct r1bio *r1_bio; 2301 2292 unsigned long flags; 2302 2293 struct r1conf *conf = mddev->private; ··· 2794 2783 int i; 2795 2784 struct md_rdev *rdev; 2796 2785 int ret; 2786 + bool discard_supported = false; 2797 2787 2798 2788 if (mddev->level != 1) { 2799 2789 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", ··· 2824 2812 continue; 2825 2813 disk_stack_limits(mddev->gendisk, rdev->bdev, 2826 2814 rdev->data_offset << 9); 2815 + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) 2816 + discard_supported = true; 2827 2817 } 2828 2818 2829 2819 mddev->degraded = 0; ··· 2860 2846 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2861 2847 mddev->queue->backing_dev_info.congested_data = mddev; 2862 2848 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); 2849 + 2850 + if (discard_supported) 2851 + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 2852 + mddev->queue); 2853 + else 2854 + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 2855 + mddev->queue); 2863 2856 } 2864 2857 2865 2858 ret = md_integrity_register(mddev);

+84 -11

drivers/md/raid10.c

··· 911 911 while (bio) { /* submit pending writes */ 912 912 struct bio *next = bio->bi_next; 913 913 bio->bi_next = NULL; 914 - generic_make_request(bio); 914 + if (unlikely((bio->bi_rw & REQ_DISCARD) && 915 + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) 916 + /* Just ignore it */ 917 + bio_endio(bio, 0); 918 + else 919 + generic_make_request(bio); 915 920 bio = next; 916 921 } 917 922 } else ··· 1055 1050 return rdev->new_data_offset; 1056 1051 } 1057 1052 1053 + struct raid10_plug_cb { 1054 + struct blk_plug_cb cb; 1055 + struct bio_list pending; 1056 + int pending_cnt; 1057 + }; 1058 + 1059 + static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) 1060 + { 1061 + struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, 1062 + cb); 1063 + struct mddev *mddev = plug->cb.data; 1064 + struct r10conf *conf = mddev->private; 1065 + struct bio *bio; 1066 + 1067 + if (from_schedule) { 1068 + spin_lock_irq(&conf->device_lock); 1069 + bio_list_merge(&conf->pending_bio_list, &plug->pending); 1070 + conf->pending_count += plug->pending_cnt; 1071 + spin_unlock_irq(&conf->device_lock); 1072 + md_wakeup_thread(mddev->thread); 1073 + kfree(plug); 1074 + return; 1075 + } 1076 + 1077 + /* we aren't scheduling, so we can do the write-out directly. */ 1078 + bio = bio_list_get(&plug->pending); 1079 + bitmap_unplug(mddev->bitmap); 1080 + wake_up(&conf->wait_barrier); 1081 + 1082 + while (bio) { /* submit pending writes */ 1083 + struct bio *next = bio->bi_next; 1084 + bio->bi_next = NULL; 1085 + generic_make_request(bio); 1086 + bio = next; 1087 + } 1088 + kfree(plug); 1089 + } 1090 + 1058 1091 static void make_request(struct mddev *mddev, struct bio * bio) 1059 1092 { 1060 1093 struct r10conf *conf = mddev->private; ··· 1104 1061 const int rw = bio_data_dir(bio); 1105 1062 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 1106 1063 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 1064 + const unsigned long do_discard = (bio->bi_rw 1065 + & (REQ_DISCARD | REQ_SECURE)); 1107 1066 unsigned long flags; 1108 1067 struct md_rdev *blocked_rdev; 1068 + struct blk_plug_cb *cb; 1069 + struct raid10_plug_cb *plug = NULL; 1109 1070 int sectors_handled; 1110 1071 int max_sectors; 1111 1072 int sectors; ··· 1128 1081 || conf->prev.near_copies < conf->prev.raid_disks))) { 1129 1082 struct bio_pair *bp; 1130 1083 /* Sanity check -- queue functions should prevent this happening */ 1131 - if (bio->bi_vcnt != 1 || 1084 + if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || 1132 1085 bio->bi_idx != 0) 1133 1086 goto bad_map; 1134 1087 /* This is a one page bio that upper layers ··· 1457 1410 conf->mirrors[d].rdev)); 1458 1411 mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 1459 1412 mbio->bi_end_io = raid10_end_write_request; 1460 - mbio->bi_rw = WRITE | do_sync | do_fua; 1413 + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; 1461 1414 mbio->bi_private = r10_bio; 1462 1415 1463 1416 atomic_inc(&r10_bio->remaining); 1417 + 1418 + cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); 1419 + if (cb) 1420 + plug = container_of(cb, struct raid10_plug_cb, cb); 1421 + else 1422 + plug = NULL; 1464 1423 spin_lock_irqsave(&conf->device_lock, flags); 1465 - bio_list_add(&conf->pending_bio_list, mbio); 1466 - conf->pending_count++; 1424 + if (plug) { 1425 + bio_list_add(&plug->pending, mbio); 1426 + plug->pending_cnt++; 1427 + } else { 1428 + bio_list_add(&conf->pending_bio_list, mbio); 1429 + conf->pending_count++; 1430 + } 1467 1431 spin_unlock_irqrestore(&conf->device_lock, flags); 1468 - if (!mddev_check_plugged(mddev)) 1432 + if (!plug) 1469 1433 md_wakeup_thread(mddev->thread); 1470 1434 1471 1435 if (!r10_bio->devs[i].repl_bio) ··· 1497 1439 conf->mirrors[d].replacement)); 1498 1440 mbio->bi_bdev = conf->mirrors[d].replacement->bdev; 1499 1441 mbio->bi_end_io = raid10_end_write_request; 1500 - mbio->bi_rw = WRITE | do_sync | do_fua; 1442 + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; 1501 1443 mbio->bi_private = r10_bio; 1502 1444 1503 1445 atomic_inc(&r10_bio->remaining); ··· 1696 1638 && !test_bit(Faulty, &tmp->rdev->flags) 1697 1639 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 1698 1640 count++; 1699 - sysfs_notify_dirent(tmp->rdev->sysfs_state); 1641 + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 1700 1642 } 1701 1643 } 1702 1644 spin_lock_irqsave(&conf->device_lock, flags); ··· 1783 1725 clear_bit(Unmerged, &rdev->flags); 1784 1726 } 1785 1727 md_integrity_add_rdev(rdev, mddev); 1728 + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) 1729 + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); 1730 + 1786 1731 print_conf(conf); 1787 1732 return err; 1788 1733 } ··· 2013 1952 break; 2014 1953 if (j == vcnt) 2015 1954 continue; 2016 - mddev->resync_mismatches += r10_bio->sectors; 1955 + atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); 2017 1956 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2018 1957 /* Don't fix anything. */ 2019 1958 continue; ··· 2734 2673 } 2735 2674 } 2736 2675 2737 - static void raid10d(struct mddev *mddev) 2676 + static void raid10d(struct md_thread *thread) 2738 2677 { 2678 + struct mddev *mddev = thread->mddev; 2739 2679 struct r10bio *r10_bio; 2740 2680 unsigned long flags; 2741 2681 struct r10conf *conf = mddev->private; ··· 3220 3158 else { 3221 3159 bad_sectors -= (sector - first_bad); 3222 3160 if (max_sync > bad_sectors) 3223 - max_sync = max_sync; 3161 + max_sync = bad_sectors; 3224 3162 continue; 3225 3163 } 3226 3164 } ··· 3544 3482 sector_t size; 3545 3483 sector_t min_offset_diff = 0; 3546 3484 int first = 1; 3485 + bool discard_supported = false; 3547 3486 3548 3487 if (mddev->private == NULL) { 3549 3488 conf = setup_conf(mddev); ··· 3561 3498 3562 3499 chunk_size = mddev->chunk_sectors << 9; 3563 3500 if (mddev->queue) { 3501 + blk_queue_max_discard_sectors(mddev->queue, 3502 + mddev->chunk_sectors); 3564 3503 blk_queue_io_min(mddev->queue, chunk_size); 3565 3504 if (conf->geo.raid_disks % conf->geo.near_copies) 3566 3505 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); ··· 3608 3543 rdev->data_offset << 9); 3609 3544 3610 3545 disk->head_position = 0; 3546 + 3547 + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) 3548 + discard_supported = true; 3611 3549 } 3550 + 3551 + if (discard_supported) 3552 + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); 3553 + else 3554 + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); 3612 3555 3613 3556 /* need to check that every block has at least one working mirror */ 3614 3557 if (!enough(conf, -1)) {

+197 -22

drivers/md/raid5.c

··· 551 551 rw = WRITE_FUA; 552 552 else 553 553 rw = WRITE; 554 + if (test_bit(R5_Discard, &sh->dev[i].flags)) 555 + rw |= REQ_DISCARD; 554 556 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 555 557 rw = READ; 556 558 else if (test_and_clear_bit(R5_WantReplace, ··· 1176 1174 set_bit(R5_WantFUA, &dev->flags); 1177 1175 if (wbi->bi_rw & REQ_SYNC) 1178 1176 set_bit(R5_SyncIO, &dev->flags); 1179 - tx = async_copy_data(1, wbi, dev->page, 1180 - dev->sector, tx); 1177 + if (wbi->bi_rw & REQ_DISCARD) 1178 + set_bit(R5_Discard, &dev->flags); 1179 + else 1180 + tx = async_copy_data(1, wbi, dev->page, 1181 + dev->sector, tx); 1181 1182 wbi = r5_next_bio(wbi, dev->sector); 1182 1183 } 1183 1184 } ··· 1196 1191 int pd_idx = sh->pd_idx; 1197 1192 int qd_idx = sh->qd_idx; 1198 1193 int i; 1199 - bool fua = false, sync = false; 1194 + bool fua = false, sync = false, discard = false; 1200 1195 1201 1196 pr_debug("%s: stripe %llu\n", __func__, 1202 1197 (unsigned long long)sh->sector); ··· 1204 1199 for (i = disks; i--; ) { 1205 1200 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1206 1201 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1202 + discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1207 1203 } 1208 1204 1209 1205 for (i = disks; i--; ) { 1210 1206 struct r5dev *dev = &sh->dev[i]; 1211 1207 1212 1208 if (dev->written || i == pd_idx || i == qd_idx) { 1213 - set_bit(R5_UPTODATE, &dev->flags); 1209 + if (!discard) 1210 + set_bit(R5_UPTODATE, &dev->flags); 1214 1211 if (fua) 1215 1212 set_bit(R5_WantFUA, &dev->flags); 1216 1213 if (sync) ··· 1248 1241 pr_debug("%s: stripe %llu\n", __func__, 1249 1242 (unsigned long long)sh->sector); 1250 1243 1244 + for (i = 0; i < sh->disks; i++) { 1245 + if (pd_idx == i) 1246 + continue; 1247 + if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1248 + break; 1249 + } 1250 + if (i >= sh->disks) { 1251 + atomic_inc(&sh->count); 1252 + set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1253 + ops_complete_reconstruct(sh); 1254 + return; 1255 + } 1251 1256 /* check if prexor is active which means only process blocks 1252 1257 * that are part of a read-modify-write (written) 1253 1258 */ ··· 1304 1285 { 1305 1286 struct async_submit_ctl submit; 1306 1287 struct page **blocks = percpu->scribble; 1307 - int count; 1288 + int count, i; 1308 1289 1309 1290 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1291 + 1292 + for (i = 0; i < sh->disks; i++) { 1293 + if (sh->pd_idx == i || sh->qd_idx == i) 1294 + continue; 1295 + if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1296 + break; 1297 + } 1298 + if (i >= sh->disks) { 1299 + atomic_inc(&sh->count); 1300 + set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1301 + set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1302 + ops_complete_reconstruct(sh); 1303 + return; 1304 + } 1310 1305 1311 1306 count = set_syndrome_sources(blocks, sh); 1312 1307 ··· 2441 2408 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2442 2409 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2443 2410 } 2444 - spin_unlock_irq(&sh->stripe_lock); 2445 2411 2446 2412 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2447 2413 (unsigned long long)(*bip)->bi_sector, 2448 2414 (unsigned long long)sh->sector, dd_idx); 2415 + spin_unlock_irq(&sh->stripe_lock); 2449 2416 2450 2417 if (conf->mddev->bitmap && firstwrite) { 2451 2418 bitmap_startwrite(conf->mddev->bitmap, sh->sector, ··· 2512 2479 bi = sh->dev[i].towrite; 2513 2480 sh->dev[i].towrite = NULL; 2514 2481 spin_unlock_irq(&sh->stripe_lock); 2515 - if (bi) { 2516 - s->to_write--; 2482 + if (bi) 2517 2483 bitmap_end = 1; 2518 - } 2519 2484 2520 2485 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2521 2486 wake_up(&conf->wait_for_overlap); ··· 2555 2524 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2556 2525 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2557 2526 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2527 + spin_lock_irq(&sh->stripe_lock); 2558 2528 bi = sh->dev[i].toread; 2559 2529 sh->dev[i].toread = NULL; 2530 + spin_unlock_irq(&sh->stripe_lock); 2560 2531 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2561 2532 wake_up(&conf->wait_for_overlap); 2562 - if (bi) s->to_read--; 2563 2533 while (bi && bi->bi_sector < 2564 2534 sh->dev[i].sector + STRIPE_SECTORS) { 2565 2535 struct bio *nextbi = ··· 2773 2741 if (sh->dev[i].written) { 2774 2742 dev = &sh->dev[i]; 2775 2743 if (!test_bit(R5_LOCKED, &dev->flags) && 2776 - test_bit(R5_UPTODATE, &dev->flags)) { 2744 + (test_bit(R5_UPTODATE, &dev->flags) || 2745 + test_and_clear_bit(R5_Discard, &dev->flags))) { 2777 2746 /* We can return any write requests */ 2778 2747 struct bio *wbi, *wbi2; 2779 2748 pr_debug("Return write for disc %d\n", i); ··· 2808 2775 int disks) 2809 2776 { 2810 2777 int rmw = 0, rcw = 0, i; 2811 - if (conf->max_degraded == 2) { 2812 - /* RAID6 requires 'rcw' in current implementation 2813 - * Calculate the real rcw later - for now fake it 2778 + sector_t recovery_cp = conf->mddev->recovery_cp; 2779 + 2780 + /* RAID6 requires 'rcw' in current implementation. 2781 + * Otherwise, check whether resync is now happening or should start. 2782 + * If yes, then the array is dirty (after unclean shutdown or 2783 + * initial creation), so parity in some stripes might be inconsistent. 2784 + * In this case, we need to always do reconstruct-write, to ensure 2785 + * that in case of drive failure or read-error correction, we 2786 + * generate correct data from the parity. 2787 + */ 2788 + if (conf->max_degraded == 2 || 2789 + (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { 2790 + /* Calculate the real rcw later - for now make it 2814 2791 * look like rcw is cheaper 2815 2792 */ 2816 2793 rcw = 1; rmw = 2; 2794 + pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", 2795 + conf->max_degraded, (unsigned long long)recovery_cp, 2796 + (unsigned long long)sh->sector); 2817 2797 } else for (i = disks; i--; ) { 2818 2798 /* would I have to read this buffer for read_modify_write */ 2819 2799 struct r5dev *dev = &sh->dev[i]; ··· 2978 2932 */ 2979 2933 set_bit(STRIPE_INSYNC, &sh->state); 2980 2934 else { 2981 - conf->mddev->resync_mismatches += STRIPE_SECTORS; 2935 + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 2982 2936 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2983 2937 /* don't try to repair!! */ 2984 2938 set_bit(STRIPE_INSYNC, &sh->state); ··· 3130 3084 */ 3131 3085 } 3132 3086 } else { 3133 - conf->mddev->resync_mismatches += STRIPE_SECTORS; 3087 + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3134 3088 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3135 3089 /* don't try to repair!! */ 3136 3090 set_bit(STRIPE_INSYNC, &sh->state); ··· 3505 3459 if (s.written && 3506 3460 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3507 3461 && !test_bit(R5_LOCKED, &pdev->flags) 3508 - && test_bit(R5_UPTODATE, &pdev->flags)))) && 3462 + && (test_bit(R5_UPTODATE, &pdev->flags) || 3463 + test_bit(R5_Discard, &pdev->flags))))) && 3509 3464 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3510 3465 && !test_bit(R5_LOCKED, &qdev->flags) 3511 - && test_bit(R5_UPTODATE, &qdev->flags))))) 3466 + && (test_bit(R5_UPTODATE, &qdev->flags) || 3467 + test_bit(R5_Discard, &qdev->flags)))))) 3512 3468 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3513 3469 3514 3470 /* Now we might consider reading some blocks, either to check/generate ··· 3537 3489 /* All the 'written' buffers and the parity block are ready to 3538 3490 * be written back to disk 3539 3491 */ 3540 - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3492 + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 3493 + !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 3541 3494 BUG_ON(sh->qd_idx >= 0 && 3542 - !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); 3495 + !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 3496 + !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 3543 3497 for (i = disks; i--; ) { 3544 3498 struct r5dev *dev = &sh->dev[i]; 3545 3499 if (test_bit(R5_LOCKED, &dev->flags) && ··· 4122 4072 release_stripe(sh); 4123 4073 } 4124 4074 4075 + static void make_discard_request(struct mddev *mddev, struct bio *bi) 4076 + { 4077 + struct r5conf *conf = mddev->private; 4078 + sector_t logical_sector, last_sector; 4079 + struct stripe_head *sh; 4080 + int remaining; 4081 + int stripe_sectors; 4082 + 4083 + if (mddev->reshape_position != MaxSector) 4084 + /* Skip discard while reshape is happening */ 4085 + return; 4086 + 4087 + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4088 + last_sector = bi->bi_sector + (bi->bi_size>>9); 4089 + 4090 + bi->bi_next = NULL; 4091 + bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4092 + 4093 + stripe_sectors = conf->chunk_sectors * 4094 + (conf->raid_disks - conf->max_degraded); 4095 + logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 4096 + stripe_sectors); 4097 + sector_div(last_sector, stripe_sectors); 4098 + 4099 + logical_sector *= conf->chunk_sectors; 4100 + last_sector *= conf->chunk_sectors; 4101 + 4102 + for (; logical_sector < last_sector; 4103 + logical_sector += STRIPE_SECTORS) { 4104 + DEFINE_WAIT(w); 4105 + int d; 4106 + again: 4107 + sh = get_active_stripe(conf, logical_sector, 0, 0, 0); 4108 + prepare_to_wait(&conf->wait_for_overlap, &w, 4109 + TASK_UNINTERRUPTIBLE); 4110 + spin_lock_irq(&sh->stripe_lock); 4111 + for (d = 0; d < conf->raid_disks; d++) { 4112 + if (d == sh->pd_idx || d == sh->qd_idx) 4113 + continue; 4114 + if (sh->dev[d].towrite || sh->dev[d].toread) { 4115 + set_bit(R5_Overlap, &sh->dev[d].flags); 4116 + spin_unlock_irq(&sh->stripe_lock); 4117 + release_stripe(sh); 4118 + schedule(); 4119 + goto again; 4120 + } 4121 + } 4122 + finish_wait(&conf->wait_for_overlap, &w); 4123 + for (d = 0; d < conf->raid_disks; d++) { 4124 + if (d == sh->pd_idx || d == sh->qd_idx) 4125 + continue; 4126 + sh->dev[d].towrite = bi; 4127 + set_bit(R5_OVERWRITE, &sh->dev[d].flags); 4128 + raid5_inc_bi_active_stripes(bi); 4129 + } 4130 + spin_unlock_irq(&sh->stripe_lock); 4131 + if (conf->mddev->bitmap) { 4132 + for (d = 0; 4133 + d < conf->raid_disks - conf->max_degraded; 4134 + d++) 4135 + bitmap_startwrite(mddev->bitmap, 4136 + sh->sector, 4137 + STRIPE_SECTORS, 4138 + 0); 4139 + sh->bm_seq = conf->seq_flush + 1; 4140 + set_bit(STRIPE_BIT_DELAY, &sh->state); 4141 + } 4142 + 4143 + set_bit(STRIPE_HANDLE, &sh->state); 4144 + clear_bit(STRIPE_DELAYED, &sh->state); 4145 + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4146 + atomic_inc(&conf->preread_active_stripes); 4147 + release_stripe_plug(mddev, sh); 4148 + } 4149 + 4150 + remaining = raid5_dec_bi_active_stripes(bi); 4151 + if (remaining == 0) { 4152 + md_write_end(mddev); 4153 + bio_endio(bi, 0); 4154 + } 4155 + } 4156 + 4125 4157 static void make_request(struct mddev *mddev, struct bio * bi) 4126 4158 { 4127 4159 struct r5conf *conf = mddev->private; ··· 4225 4093 mddev->reshape_position == MaxSector && 4226 4094 chunk_aligned_read(mddev,bi)) 4227 4095 return; 4096 + 4097 + if (unlikely(bi->bi_rw & REQ_DISCARD)) { 4098 + make_discard_request(mddev, bi); 4099 + return; 4100 + } 4228 4101 4229 4102 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4230 4103 last_sector = bi->bi_sector + (bi->bi_size>>9); ··· 4767 4630 * During the scan, completed stripes are saved for us by the interrupt 4768 4631 * handler, so that they will not have to wait for our next wakeup. 4769 4632 */ 4770 - static void raid5d(struct mddev *mddev) 4633 + static void raid5d(struct md_thread *thread) 4771 4634 { 4635 + struct mddev *mddev = thread->mddev; 4772 4636 struct r5conf *conf = mddev->private; 4773 4637 int handled; 4774 4638 struct blk_plug plug; ··· 5504 5366 5505 5367 if (mddev->queue) { 5506 5368 int chunk_size; 5369 + bool discard_supported = true; 5507 5370 /* read-ahead size must cover two whole stripes, which 5508 5371 * is 2 * (datadisks) * chunksize where 'n' is the 5509 5372 * number of raid devices ··· 5524 5385 blk_queue_io_min(mddev->queue, chunk_size); 5525 5386 blk_queue_io_opt(mddev->queue, chunk_size * 5526 5387 (conf->raid_disks - conf->max_degraded)); 5388 + /* 5389 + * We can only discard a whole stripe. It doesn't make sense to 5390 + * discard data disk but write parity disk 5391 + */ 5392 + stripe = stripe * PAGE_SIZE; 5393 + mddev->queue->limits.discard_alignment = stripe; 5394 + mddev->queue->limits.discard_granularity = stripe; 5395 + /* 5396 + * unaligned part of discard request will be ignored, so can't 5397 + * guarantee discard_zerors_data 5398 + */ 5399 + mddev->queue->limits.discard_zeroes_data = 0; 5527 5400 5528 5401 rdev_for_each(rdev, mddev) { 5529 5402 disk_stack_limits(mddev->gendisk, rdev->bdev, 5530 5403 rdev->data_offset << 9); 5531 5404 disk_stack_limits(mddev->gendisk, rdev->bdev, 5532 5405 rdev->new_data_offset << 9); 5406 + /* 5407 + * discard_zeroes_data is required, otherwise data 5408 + * could be lost. Consider a scenario: discard a stripe 5409 + * (the stripe could be inconsistent if 5410 + * discard_zeroes_data is 0); write one disk of the 5411 + * stripe (the stripe could be inconsistent again 5412 + * depending on which disks are used to calculate 5413 + * parity); the disk is broken; The stripe data of this 5414 + * disk is lost. 5415 + */ 5416 + if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 5417 + !bdev_get_queue(rdev->bdev)-> 5418 + limits.discard_zeroes_data) 5419 + discard_supported = false; 5533 5420 } 5421 + 5422 + if (discard_supported && 5423 + mddev->queue->limits.max_discard_sectors >= stripe && 5424 + mddev->queue->limits.discard_granularity >= stripe) 5425 + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 5426 + mddev->queue); 5427 + else 5428 + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 5429 + mddev->queue); 5534 5430 } 5535 5431 5536 5432 return 0; ··· 5876 5702 if (!check_stripe_cache(mddev)) 5877 5703 return -ENOSPC; 5878 5704 5879 - return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 5705 + return resize_stripes(conf, (conf->previous_raid_disks 5706 + + mddev->delta_disks)); 5880 5707 } 5881 5708 5882 5709 static int raid5_start_reshape(struct mddev *mddev)

+1

drivers/md/raid5.h

··· 298 298 R5_WantReplace, /* We need to update the replacement, we have read 299 299 * data in, and now is a good time to write it out. 300 300 */ 301 + R5_Discard, /* Discard the stripe */ 301 302 }; 302 303 303 304 /*

Configure Feed

Configure Feed