Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
md.c at master (11011 lines, 290 kB)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicated, like
         adding a device to an array when it has incompatible metadata
     pr_info() for every interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

static const char *action_name[NR_SYNC_ACTIONS] = {
	[ACTION_RESYNC]		= "resync",
	[ACTION_RECOVER]	= "recover",
	[ACTION_CHECK]		= "check",
	[ACTION_REPAIR]		= "repair",
	[ACTION_RESHAPE]	= "reshape",
	[ACTION_FROZEN]		= "frozen",
	[ACTION_IDLE]		= "idle",
};

static DEFINE_XARRAY(md_submodule);

static const struct kobj_type md_ktype;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);

/*
 * This workqueue is used for sync_work to register new sync_thread, and for
 * del_work to remove rdev, and for event_work that is only set by dm-raid.
 *
 * Noted that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex grabbed.
 */
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev);
static void md_wakeup_thread_directly(struct md_thread __rcu **thread);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
 * does not show up that much. Increase it if you want to have more guaranteed
 * speed. Note that the RAID driver will use the maximum bandwidth
 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
 *
 * Background sync IO speed control:
 *
 * - below speed min:
 *   no limit;
 * - above speed min and below speed max:
 *   a) if mddev is idle, then no limit;
 *   b) if mddev is busy handling normal IO, then limit inflight sync IO
 *      to sync_io_depth;
 * - above speed max:
 *   sync IO can't be issued;
 *
 * Following configurations can be changed via /proc/sys/dev/raid/ for system
 * or /sys/block/mdX/md/ for one array.
 */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static int sysctl_sync_io_depth = 32;

static int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static int sync_io_depth(struct mddev *mddev)
{
	return mddev->sync_io_depth ?
		mddev->sync_io_depth : sysctl_sync_io_depth;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals with BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}

static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool does not exist */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * rdev needs to enable serial stuffs if it meets the conditions:
 * 1. it is multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}

/*
 * Init resource for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device which returns true from rdev_enable_serial.
 * 2. rdev is NULL, means we want to enable serialization for all rdevs.
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	int ret = 0;

	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		return;

	if (mddev->serial_info_pool == NULL) {
		/*
		 * already in memalloc noio context by
		 * mddev_suspend()
		 */
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						    sizeof(struct serial_info));
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}
}

/*
 * Free resource from rdev(s), and destroy serial_info_pool under conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. when bitmap is destroyed while policy is not enabled.
 * 3. for disable policy, the pool is destroyed only when no rdev needs it.
 */
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (mddev->serial_info_pool) {
		struct md_rdev *temp;
		int num = 0; /* used to track if other rdevs need the pool */

		rdev_for_each(temp, mddev) {
			if (!rdev) {
				if (!test_bit(MD_SERIALIZE_POLICY,
					      &mddev->flags) ||
				    !rdev_need_serial(temp))
					rdev_uninit_serial(temp);
				else
					num++;
			} else if (temp != rdev &&
				   test_bit(CollisionCheck, &temp->flags))
				num++;
		}

		if (rdev)
			rdev_uninit_serial(rdev);

		if (num)
			pr_info("The mempool could be used by other devices\n");
		else {
			mempool_destroy(mddev->serial_info_pool);
			mddev->serial_info_pool = NULL;
		}
	}
}

static struct ctl_table_header *raid_table_header;

static const struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sync_io_depth",
		.data		= &sysctl_sync_io_depth,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it. This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;
static bool legacy_async_del_gendisk = true;
static bool check_new_feature = true;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
348 * 349 * Events are: 350 * start array, stop array, error, add device, remove device, 351 * start build, activate spare 352 */ 353static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 354static atomic_t md_event_count; 355void md_new_event(void) 356{ 357 atomic_inc(&md_event_count); 358 wake_up(&md_event_waiters); 359} 360EXPORT_SYMBOL_GPL(md_new_event); 361 362/* 363 * Enables to iterate over all existing md arrays 364 * all_mddevs_lock protects this list. 365 */ 366static LIST_HEAD(all_mddevs); 367static DEFINE_SPINLOCK(all_mddevs_lock); 368 369static bool is_md_suspended(struct mddev *mddev) 370{ 371 return percpu_ref_is_dying(&mddev->active_io); 372} 373/* Rather than calling directly into the personality make_request function, 374 * IO requests come here first so that we can check if the device is 375 * being suspended pending a reconfiguration. 376 * We hold a refcount over the call to ->make_request. By the time that 377 * call has finished, the bio has been linked into some internal structure 378 * and so is visible to ->quiesce(), so we don't need the refcount any more. 379 */ 380static bool is_suspended(struct mddev *mddev, struct bio *bio) 381{ 382 if (is_md_suspended(mddev)) 383 return true; 384 if (bio_data_dir(bio) != WRITE) 385 return false; 386 if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) 387 return false; 388 if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) 389 return false; 390 if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) 391 return false; 392 return true; 393} 394 395bool md_handle_request(struct mddev *mddev, struct bio *bio) 396{ 397check_suspended: 398 if (is_suspended(mddev, bio)) { 399 /* Bail out if REQ_NOWAIT is set for the bio */ 400 if (bio->bi_opf & REQ_NOWAIT) { 401 bio_wouldblock_error(bio); 402 return true; 403 } 404 wait_event(mddev->sb_wait, !is_suspended(mddev, bio)); 405 } 406 if (!percpu_ref_tryget_live(&mddev->active_io)) 407 goto check_suspended; 408 409 if (!mddev->pers->make_request(mddev, bio)) { 410 percpu_ref_put(&mddev->active_io); 411 if (mddev_is_dm(mddev) && mddev->pers->prepare_suspend) 412 return false; 413 goto check_suspended; 414 } 415 416 percpu_ref_put(&mddev->active_io); 417 return true; 418} 419EXPORT_SYMBOL(md_handle_request); 420 421static void md_submit_bio(struct bio *bio) 422{ 423 const int rw = bio_data_dir(bio); 424 struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; 425 426 if (mddev == NULL || mddev->pers == NULL) { 427 bio_io_error(bio); 428 return; 429 } 430 431 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { 432 bio_io_error(bio); 433 return; 434 } 435 436 bio = bio_split_to_limits(bio); 437 if (!bio) 438 return; 439 440 if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { 441 if (bio_sectors(bio) != 0) 442 bio->bi_status = BLK_STS_IOERR; 443 bio_endio(bio); 444 return; 445 } 446 447 /* bio could be mergeable after passing to underlayer */ 448 bio->bi_opf &= ~REQ_NOMERGE; 449 450 md_handle_request(mddev, bio); 451} 452 453/* 454 * Make sure no new requests are submitted to the device, and any requests that 455 * have been submitted are completely handled. 456 */ 457int mddev_suspend(struct mddev *mddev, bool interruptible) 458{ 459 int err = 0; 460 461 /* 462 * hold reconfig_mutex to wait for normal io will deadlock, because 463 * other context can't update super_block, and normal io can rely on 464 * updating super_block. 
465 */ 466 lockdep_assert_not_held(&mddev->reconfig_mutex); 467 468 if (interruptible) 469 err = mutex_lock_interruptible(&mddev->suspend_mutex); 470 else 471 mutex_lock(&mddev->suspend_mutex); 472 if (err) 473 return err; 474 475 if (mddev->suspended) { 476 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 477 mutex_unlock(&mddev->suspend_mutex); 478 return 0; 479 } 480 481 percpu_ref_kill(&mddev->active_io); 482 483 /* 484 * RAID456 IO can sleep in wait_for_reshape while still holding an 485 * active_io reference. If reshape is already interrupted or frozen, 486 * wake those waiters so they can abort and drop the reference instead 487 * of deadlocking suspend. 488 */ 489 if (mddev->pers && mddev->pers->prepare_suspend && 490 reshape_interrupted(mddev)) 491 mddev->pers->prepare_suspend(mddev); 492 493 if (interruptible) 494 err = wait_event_interruptible(mddev->sb_wait, 495 percpu_ref_is_zero(&mddev->active_io)); 496 else 497 wait_event(mddev->sb_wait, 498 percpu_ref_is_zero(&mddev->active_io)); 499 if (err) { 500 percpu_ref_resurrect(&mddev->active_io); 501 mutex_unlock(&mddev->suspend_mutex); 502 return err; 503 } 504 505 /* 506 * For raid456, io might be waiting for reshape to make progress, 507 * allow new reshape to start while waiting for io to be done to 508 * prevent deadlock. 509 */ 510 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 511 512 /* restrict memory reclaim I/O during raid array is suspend */ 513 mddev->noio_flag = memalloc_noio_save(); 514 515 mutex_unlock(&mddev->suspend_mutex); 516 return 0; 517} 518EXPORT_SYMBOL_GPL(mddev_suspend); 519 520static void __mddev_resume(struct mddev *mddev, bool recovery_needed) 521{ 522 lockdep_assert_not_held(&mddev->reconfig_mutex); 523 524 mutex_lock(&mddev->suspend_mutex); 525 WRITE_ONCE(mddev->suspended, mddev->suspended - 1); 526 if (mddev->suspended) { 527 mutex_unlock(&mddev->suspend_mutex); 528 return; 529 } 530 531 /* entred the memalloc scope from mddev_suspend() */ 532 memalloc_noio_restore(mddev->noio_flag); 533 534 percpu_ref_resurrect(&mddev->active_io); 535 wake_up(&mddev->sb_wait); 536 537 if (recovery_needed) 538 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 539 md_wakeup_thread(mddev->thread); 540 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 541 542 mutex_unlock(&mddev->suspend_mutex); 543} 544 545void mddev_resume(struct mddev *mddev) 546{ 547 return __mddev_resume(mddev, true); 548} 549EXPORT_SYMBOL_GPL(mddev_resume); 550 551/* sync bdev before setting device to readonly or stopping raid*/ 552static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) 553{ 554 mutex_lock(&mddev->open_mutex); 555 if (mddev->pers && atomic_read(&mddev->openers) > opener_num) { 556 mutex_unlock(&mddev->open_mutex); 557 return -EBUSY; 558 } 559 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 560 mutex_unlock(&mddev->open_mutex); 561 return -EBUSY; 562 } 563 mutex_unlock(&mddev->open_mutex); 564 565 sync_blockdev(mddev->gendisk->part0); 566 return 0; 567} 568 569/* 570 * The only difference from bio_chain_endio() is that the current 571 * bi_status of bio does not affect the bi_status of parent. 572 */ 573static void md_end_flush(struct bio *bio) 574{ 575 struct bio *parent = bio->bi_private; 576 577 /* 578 * If any flush io error before the power failure, 579 * disk data may be lost. 
580 */ 581 if (bio->bi_status) 582 pr_err("md: %pg flush io error %d\n", bio->bi_bdev, 583 blk_status_to_errno(bio->bi_status)); 584 585 bio_put(bio); 586 bio_endio(parent); 587} 588 589bool md_flush_request(struct mddev *mddev, struct bio *bio) 590{ 591 struct md_rdev *rdev; 592 struct bio *new; 593 594 /* 595 * md_flush_reqeust() should be called under md_handle_request() and 596 * 'active_io' is already grabbed. Hence it's safe to get rdev directly 597 * without rcu protection. 598 */ 599 WARN_ON(percpu_ref_is_zero(&mddev->active_io)); 600 601 rdev_for_each(rdev, mddev) { 602 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) 603 continue; 604 605 new = bio_alloc_bioset(rdev->bdev, 0, 606 REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, 607 &mddev->bio_set); 608 new->bi_private = bio; 609 new->bi_end_io = md_end_flush; 610 bio_inc_remaining(bio); 611 submit_bio(new); 612 } 613 614 if (bio_sectors(bio) == 0) { 615 bio_endio(bio); 616 return true; 617 } 618 619 bio->bi_opf &= ~REQ_PREFLUSH; 620 return false; 621} 622EXPORT_SYMBOL(md_flush_request); 623 624static inline struct mddev *mddev_get(struct mddev *mddev) 625{ 626 lockdep_assert_held(&all_mddevs_lock); 627 628 if (test_bit(MD_DELETED, &mddev->flags)) 629 return NULL; 630 atomic_inc(&mddev->active); 631 return mddev; 632} 633 634static void mddev_delayed_delete(struct work_struct *ws); 635 636static void __mddev_put(struct mddev *mddev) 637{ 638 if (mddev->raid_disks || !list_empty(&mddev->disks) || 639 mddev->ctime || mddev->hold_active) 640 return; 641 642 /* 643 * If array is freed by stopping array, MD_DELETED is set by 644 * do_md_stop(), MD_DELETED is still set here in case mddev is freed 645 * directly by closing a mddev that is created by create_on_open. 646 */ 647 set_bit(MD_DELETED, &mddev->flags); 648 /* 649 * Call queue_work inside the spinlock so that flush_workqueue() after 650 * mddev_find will succeed in waiting for the work to be done. 651 */ 652 queue_work(md_misc_wq, &mddev->del_work); 653} 654 655static void mddev_put_locked(struct mddev *mddev) 656{ 657 if (atomic_dec_and_test(&mddev->active)) 658 __mddev_put(mddev); 659} 660 661void mddev_put(struct mddev *mddev) 662{ 663 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 664 return; 665 666 __mddev_put(mddev); 667 spin_unlock(&all_mddevs_lock); 668} 669 670static void md_safemode_timeout(struct timer_list *t); 671static void md_start_sync(struct work_struct *ws); 672 673static void active_io_release(struct percpu_ref *ref) 674{ 675 struct mddev *mddev = container_of(ref, struct mddev, active_io); 676 677 wake_up(&mddev->sb_wait); 678} 679 680static void no_op(struct percpu_ref *r) {} 681 682static void md_bitmap_sysfs_add(struct mddev *mddev) 683{ 684 if (sysfs_update_groups(&mddev->kobj, mddev->bitmap_ops->groups)) 685 pr_warn("md: cannot register extra bitmap attributes for %s\n", 686 mdname(mddev)); 687 else 688 /* 689 * Inform user with KOBJ_CHANGE about new bitmap 690 * attributes. 
691 */ 692 kobject_uevent(&mddev->kobj, KOBJ_CHANGE); 693} 694 695static void md_bitmap_sysfs_del(struct mddev *mddev) 696{ 697 int nr_groups = 0; 698 699 for (nr_groups = 0; mddev->bitmap_ops->groups[nr_groups]; nr_groups++) 700 ; 701 702 while (--nr_groups >= 1) 703 sysfs_unmerge_group(&mddev->kobj, 704 mddev->bitmap_ops->groups[nr_groups]); 705 sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->groups[0]); 706} 707 708bool mddev_set_bitmap_ops_nosysfs(struct mddev *mddev) 709{ 710 struct md_submodule_head *head; 711 712 if (mddev->bitmap_ops && 713 mddev->bitmap_ops->head.id == mddev->bitmap_id) 714 return true; 715 716 xa_lock(&md_submodule); 717 head = xa_load(&md_submodule, mddev->bitmap_id); 718 719 if (!head) { 720 pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id); 721 goto err; 722 } 723 724 if (head->type != MD_BITMAP) { 725 pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id); 726 goto err; 727 } 728 729 mddev->bitmap_ops = (void *)head; 730 xa_unlock(&md_submodule); 731 return true; 732 733err: 734 xa_unlock(&md_submodule); 735 return false; 736} 737 738int mddev_init(struct mddev *mddev) 739{ 740 int err = 0; 741 742 if (!IS_ENABLED(CONFIG_MD_BITMAP)) 743 mddev->bitmap_id = ID_BITMAP_NONE; 744 else 745 mddev->bitmap_id = ID_BITMAP; 746 747 if (percpu_ref_init(&mddev->active_io, active_io_release, 748 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 749 return -ENOMEM; 750 751 if (percpu_ref_init(&mddev->writes_pending, no_op, 752 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { 753 err = -ENOMEM; 754 goto exit_acitve_io; 755 } 756 757 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 758 if (err) 759 goto exit_writes_pending; 760 761 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 762 if (err) 763 goto exit_bio_set; 764 765 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 766 offsetof(struct md_io_clone, bio_clone), 0); 767 if (err) 768 goto exit_sync_set; 769 770 /* We want to start with the refcount at zero */ 771 percpu_ref_put(&mddev->writes_pending); 772 773 mutex_init(&mddev->open_mutex); 774 mutex_init(&mddev->reconfig_mutex); 775 mutex_init(&mddev->suspend_mutex); 776 mutex_init(&mddev->bitmap_info.mutex); 777 INIT_LIST_HEAD(&mddev->disks); 778 INIT_LIST_HEAD(&mddev->all_mddevs); 779 INIT_LIST_HEAD(&mddev->deleting); 780 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); 781 atomic_set(&mddev->active, 1); 782 atomic_set(&mddev->openers, 0); 783 atomic_set(&mddev->sync_seq, 0); 784 spin_lock_init(&mddev->lock); 785 init_waitqueue_head(&mddev->sb_wait); 786 init_waitqueue_head(&mddev->recovery_wait); 787 mddev->reshape_position = MaxSector; 788 mddev->reshape_backwards = 0; 789 mddev->last_sync_action = ACTION_IDLE; 790 mddev->resync_min = 0; 791 mddev->resync_max = MaxSector; 792 mddev->level = LEVEL_NONE; 793 794 INIT_WORK(&mddev->sync_work, md_start_sync); 795 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 796 797 return 0; 798 799exit_sync_set: 800 bioset_exit(&mddev->sync_set); 801exit_bio_set: 802 bioset_exit(&mddev->bio_set); 803exit_writes_pending: 804 percpu_ref_exit(&mddev->writes_pending); 805exit_acitve_io: 806 percpu_ref_exit(&mddev->active_io); 807 return err; 808} 809EXPORT_SYMBOL_GPL(mddev_init); 810 811void mddev_destroy(struct mddev *mddev) 812{ 813 bioset_exit(&mddev->bio_set); 814 bioset_exit(&mddev->sync_set); 815 bioset_exit(&mddev->io_clone_set); 816 percpu_ref_exit(&mddev->active_io); 817 percpu_ref_exit(&mddev->writes_pending); 818} 819EXPORT_SYMBOL_GPL(mddev_destroy); 820 821static 
struct mddev *mddev_find_locked(dev_t unit) 822{ 823 struct mddev *mddev; 824 825 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 826 if (mddev->unit == unit) 827 return mddev; 828 829 return NULL; 830} 831 832/* find an unused unit number */ 833static dev_t mddev_alloc_unit(void) 834{ 835 static int next_minor = 512; 836 int start = next_minor; 837 bool is_free = 0; 838 dev_t dev = 0; 839 840 while (!is_free) { 841 dev = MKDEV(MD_MAJOR, next_minor); 842 next_minor++; 843 if (next_minor > MINORMASK) 844 next_minor = 0; 845 if (next_minor == start) 846 return 0; /* Oh dear, all in use. */ 847 is_free = !mddev_find_locked(dev); 848 } 849 850 return dev; 851} 852 853static struct mddev *mddev_alloc(dev_t unit) 854{ 855 struct mddev *new; 856 int error; 857 858 if (unit && MAJOR(unit) != MD_MAJOR) 859 unit &= ~((1 << MdpMinorShift) - 1); 860 861 new = kzalloc_obj(*new); 862 if (!new) 863 return ERR_PTR(-ENOMEM); 864 865 error = mddev_init(new); 866 if (error) 867 goto out_free_new; 868 869 spin_lock(&all_mddevs_lock); 870 if (unit) { 871 error = -EEXIST; 872 if (mddev_find_locked(unit)) 873 goto out_destroy_new; 874 new->unit = unit; 875 if (MAJOR(unit) == MD_MAJOR) 876 new->md_minor = MINOR(unit); 877 else 878 new->md_minor = MINOR(unit) >> MdpMinorShift; 879 new->hold_active = UNTIL_IOCTL; 880 } else { 881 error = -ENODEV; 882 new->unit = mddev_alloc_unit(); 883 if (!new->unit) 884 goto out_destroy_new; 885 new->md_minor = MINOR(new->unit); 886 new->hold_active = UNTIL_STOP; 887 } 888 889 list_add(&new->all_mddevs, &all_mddevs); 890 spin_unlock(&all_mddevs_lock); 891 return new; 892 893out_destroy_new: 894 spin_unlock(&all_mddevs_lock); 895 mddev_destroy(new); 896out_free_new: 897 kfree(new); 898 return ERR_PTR(error); 899} 900 901static void mddev_free(struct mddev *mddev) 902{ 903 spin_lock(&all_mddevs_lock); 904 list_del(&mddev->all_mddevs); 905 spin_unlock(&all_mddevs_lock); 906 907 mddev_destroy(mddev); 908 kfree(mddev); 909} 910 911static const struct attribute_group md_redundancy_group; 912 913void mddev_unlock(struct mddev *mddev) 914{ 915 struct md_rdev *rdev; 916 struct md_rdev *tmp; 917 LIST_HEAD(delete); 918 919 if (!list_empty(&mddev->deleting)) 920 list_splice_init(&mddev->deleting, &delete); 921 922 if (mddev->to_remove) { 923 /* These cannot be removed under reconfig_mutex as 924 * an access to the files will try to take reconfig_mutex 925 * while holding the file unremovable, which leads to 926 * a deadlock. 927 * So hold set sysfs_active while the remove in happeing, 928 * and anything else which might set ->to_remove or my 929 * otherwise change the sysfs namespace will fail with 930 * -EBUSY if sysfs_active is still set. 931 * We set sysfs_active under reconfig_mutex and elsewhere 932 * test it under the same mutex to ensure its correct value 933 * is seen. 
934 */ 935 const struct attribute_group *to_remove = mddev->to_remove; 936 mddev->to_remove = NULL; 937 mddev->sysfs_active = 1; 938 mutex_unlock(&mddev->reconfig_mutex); 939 940 if (mddev->kobj.sd) { 941 if (to_remove != &md_redundancy_group) 942 sysfs_remove_group(&mddev->kobj, to_remove); 943 if (mddev->pers == NULL || 944 mddev->pers->sync_request == NULL) { 945 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 946 if (mddev->sysfs_action) 947 sysfs_put(mddev->sysfs_action); 948 if (mddev->sysfs_completed) 949 sysfs_put(mddev->sysfs_completed); 950 if (mddev->sysfs_degraded) 951 sysfs_put(mddev->sysfs_degraded); 952 mddev->sysfs_action = NULL; 953 mddev->sysfs_completed = NULL; 954 mddev->sysfs_degraded = NULL; 955 } 956 } 957 mddev->sysfs_active = 0; 958 } else 959 mutex_unlock(&mddev->reconfig_mutex); 960 961 md_wakeup_thread(mddev->thread); 962 wake_up(&mddev->sb_wait); 963 964 list_for_each_entry_safe(rdev, tmp, &delete, same_set) { 965 list_del_init(&rdev->same_set); 966 kobject_del(&rdev->kobj); 967 export_rdev(rdev); 968 } 969 970 if (!legacy_async_del_gendisk) { 971 /* 972 * Call del_gendisk after release reconfig_mutex to avoid 973 * deadlock (e.g. call del_gendisk under the lock and an 974 * access to sysfs files waits the lock) 975 * And MD_DELETED is only used for md raid which is set in 976 * do_md_stop. dm raid only uses md_stop to stop. So dm raid 977 * doesn't need to check MD_DELETED when getting reconfig lock 978 */ 979 if (test_bit(MD_DELETED, &mddev->flags) && 980 !test_and_set_bit(MD_DO_DELETE, &mddev->flags)) { 981 kobject_del(&mddev->kobj); 982 del_gendisk(mddev->gendisk); 983 } 984 } 985} 986EXPORT_SYMBOL_GPL(mddev_unlock); 987 988struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 989{ 990 struct md_rdev *rdev; 991 992 rdev_for_each_rcu(rdev, mddev) 993 if (rdev->desc_nr == nr) 994 return rdev; 995 996 return NULL; 997} 998EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 999 1000static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 1001{ 1002 struct md_rdev *rdev; 1003 1004 rdev_for_each(rdev, mddev) 1005 if (rdev->bdev->bd_dev == dev) 1006 return rdev; 1007 1008 return NULL; 1009} 1010 1011struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) 1012{ 1013 struct md_rdev *rdev; 1014 1015 rdev_for_each_rcu(rdev, mddev) 1016 if (rdev->bdev->bd_dev == dev) 1017 return rdev; 1018 1019 return NULL; 1020} 1021EXPORT_SYMBOL_GPL(md_find_rdev_rcu); 1022 1023static struct md_personality *get_pers(int level, char *clevel) 1024{ 1025 struct md_personality *ret = NULL; 1026 struct md_submodule_head *head; 1027 unsigned long i; 1028 1029 xa_lock(&md_submodule); 1030 xa_for_each(&md_submodule, i, head) { 1031 if (head->type != MD_PERSONALITY) 1032 continue; 1033 if ((level != LEVEL_NONE && head->id == level) || 1034 !strcmp(head->name, clevel)) { 1035 if (try_module_get(head->owner)) 1036 ret = (void *)head; 1037 break; 1038 } 1039 } 1040 xa_unlock(&md_submodule); 1041 1042 if (!ret) { 1043 if (level != LEVEL_NONE) 1044 pr_warn("md: personality for level %d is not loaded!\n", 1045 level); 1046 else 1047 pr_warn("md: personality for level %s is not loaded!\n", 1048 clevel); 1049 } 1050 1051 return ret; 1052} 1053 1054static void put_pers(struct md_personality *pers) 1055{ 1056 module_put(pers->head.owner); 1057} 1058 1059/* return the offset of the super block in 512byte sectors */ 1060static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 1061{ 1062 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); 1063} 1064 1065static int 
alloc_disk_sb(struct md_rdev *rdev) 1066{ 1067 rdev->sb_page = alloc_page(GFP_KERNEL); 1068 if (!rdev->sb_page) 1069 return -ENOMEM; 1070 return 0; 1071} 1072 1073void md_rdev_clear(struct md_rdev *rdev) 1074{ 1075 if (rdev->sb_page) { 1076 put_page(rdev->sb_page); 1077 rdev->sb_loaded = 0; 1078 rdev->sb_page = NULL; 1079 rdev->sb_start = 0; 1080 rdev->sectors = 0; 1081 } 1082 if (rdev->bb_page) { 1083 put_page(rdev->bb_page); 1084 rdev->bb_page = NULL; 1085 } 1086 badblocks_exit(&rdev->badblocks); 1087} 1088EXPORT_SYMBOL_GPL(md_rdev_clear); 1089 1090static void super_written(struct bio *bio) 1091{ 1092 struct md_rdev *rdev = bio->bi_private; 1093 struct mddev *mddev = rdev->mddev; 1094 1095 if (bio->bi_status) { 1096 pr_err("md: %s gets error=%d\n", __func__, 1097 blk_status_to_errno(bio->bi_status)); 1098 md_error(mddev, rdev); 1099 if (!test_bit(Faulty, &rdev->flags) 1100 && (bio->bi_opf & MD_FAILFAST)) { 1101 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 1102 set_bit(LastDev, &rdev->flags); 1103 } 1104 } else 1105 clear_bit(LastDev, &rdev->flags); 1106 1107 bio_put(bio); 1108 1109 rdev_dec_pending(rdev, mddev); 1110 1111 if (atomic_dec_and_test(&mddev->pending_writes)) 1112 wake_up(&mddev->sb_wait); 1113} 1114 1115/** 1116 * md_write_metadata - write metadata to underlying disk, including 1117 * array superblock, badblocks, bitmap superblock and bitmap bits. 1118 * @mddev: the array to write 1119 * @rdev: the underlying disk to write 1120 * @sector: the offset to @rdev 1121 * @size: the length of the metadata 1122 * @page: the metadata 1123 * @offset: the offset to @page 1124 * 1125 * Write @size bytes of @page start from @offset, to @sector of @rdev, Increment 1126 * mddev->pending_writes before returning, and decrement it on completion, 1127 * waking up sb_wait. Caller must call md_super_wait() after issuing io to all 1128 * rdev. If an error occurred, md_error() will be called, and the @rdev will be 1129 * kicked out from @mddev. 1130 */ 1131void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, 1132 sector_t sector, int size, struct page *page, 1133 unsigned int offset) 1134{ 1135 struct bio *bio; 1136 1137 if (!page) 1138 return; 1139 1140 if (test_bit(Faulty, &rdev->flags)) 1141 return; 1142 1143 bio = bio_alloc_bioset(rdev->meta_bdev ? 
rdev->meta_bdev : rdev->bdev, 1144 1, 1145 REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META 1146 | REQ_PREFLUSH | REQ_FUA, 1147 GFP_NOIO, &mddev->sync_set); 1148 1149 atomic_inc(&rdev->nr_pending); 1150 1151 bio->bi_iter.bi_sector = sector; 1152 __bio_add_page(bio, page, size, offset); 1153 bio->bi_private = rdev; 1154 bio->bi_end_io = super_written; 1155 1156 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 1157 test_bit(FailFast, &rdev->flags) && 1158 !test_bit(LastDev, &rdev->flags)) 1159 bio->bi_opf |= MD_FAILFAST; 1160 1161 atomic_inc(&mddev->pending_writes); 1162 submit_bio(bio); 1163} 1164 1165int md_super_wait(struct mddev *mddev) 1166{ 1167 /* wait for all superblock writes that were scheduled to complete */ 1168 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1169 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 1170 return -EAGAIN; 1171 return 0; 1172} 1173 1174int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 1175 struct page *page, blk_opf_t opf, bool metadata_op) 1176{ 1177 struct bio bio; 1178 struct bio_vec bvec; 1179 1180 if (metadata_op && rdev->meta_bdev) 1181 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); 1182 else 1183 bio_init(&bio, rdev->bdev, &bvec, 1, opf); 1184 1185 if (metadata_op) 1186 bio.bi_iter.bi_sector = sector + rdev->sb_start; 1187 else if (rdev->mddev->reshape_position != MaxSector && 1188 (rdev->mddev->reshape_backwards == 1189 (sector >= rdev->mddev->reshape_position))) 1190 bio.bi_iter.bi_sector = sector + rdev->new_data_offset; 1191 else 1192 bio.bi_iter.bi_sector = sector + rdev->data_offset; 1193 __bio_add_page(&bio, page, size, 0); 1194 1195 submit_bio_wait(&bio); 1196 1197 return !bio.bi_status; 1198} 1199EXPORT_SYMBOL_GPL(sync_page_io); 1200 1201static int read_disk_sb(struct md_rdev *rdev, int size) 1202{ 1203 if (rdev->sb_loaded) 1204 return 0; 1205 1206 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) 1207 goto fail; 1208 rdev->sb_loaded = 1; 1209 return 0; 1210 1211fail: 1212 pr_err("md: disabled device %pg, could not read superblock.\n", 1213 rdev->bdev); 1214 return -EINVAL; 1215} 1216 1217static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1218{ 1219 return sb1->set_uuid0 == sb2->set_uuid0 && 1220 sb1->set_uuid1 == sb2->set_uuid1 && 1221 sb1->set_uuid2 == sb2->set_uuid2 && 1222 sb1->set_uuid3 == sb2->set_uuid3; 1223} 1224 1225static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1226{ 1227 int ret; 1228 mdp_super_t *tmp1, *tmp2; 1229 1230 tmp1 = kmalloc_obj(*tmp1); 1231 tmp2 = kmalloc_obj(*tmp2); 1232 1233 if (!tmp1 || !tmp2) { 1234 ret = 0; 1235 goto abort; 1236 } 1237 1238 *tmp1 = *sb1; 1239 *tmp2 = *sb2; 1240 1241 /* 1242 * nr_disks is not constant 1243 */ 1244 tmp1->nr_disks = 0; 1245 tmp2->nr_disks = 0; 1246 1247 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 1248abort: 1249 kfree(tmp1); 1250 kfree(tmp2); 1251 return ret; 1252} 1253 1254static u32 md_csum_fold(u32 csum) 1255{ 1256 csum = (csum & 0xffff) + (csum >> 16); 1257 return (csum & 0xffff) + (csum >> 16); 1258} 1259 1260static unsigned int calc_sb_csum(mdp_super_t *sb) 1261{ 1262 u64 newcsum = 0; 1263 u32 *sb32 = (u32*)sb; 1264 int i; 1265 unsigned int disk_csum, csum; 1266 1267 disk_csum = sb->sb_csum; 1268 sb->sb_csum = 0; 1269 1270 for (i = 0; i < MD_SB_BYTES/4 ; i++) 1271 newcsum += sb32[i]; 1272 csum = (newcsum & 0xffffffff) + (newcsum>>32); 1273 1274#ifdef CONFIG_ALPHA 1275 /* This used to use csum_partial, which was wrong for several 1276 * reasons including that 
different results are returned on 1277 * different architectures. It isn't critical that we get exactly 1278 * the same return value as before (we always csum_fold before 1279 * testing, and that removes any differences). However as we 1280 * know that csum_partial always returned a 16bit value on 1281 * alphas, do a fold to maximise conformity to previous behaviour. 1282 */ 1283 sb->sb_csum = md_csum_fold(disk_csum); 1284#else 1285 sb->sb_csum = disk_csum; 1286#endif 1287 return csum; 1288} 1289 1290/* 1291 * Handle superblock details. 1292 * We want to be able to handle multiple superblock formats 1293 * so we have a common interface to them all, and an array of 1294 * different handlers. 1295 * We rely on user-space to write the initial superblock, and support 1296 * reading and updating of superblocks. 1297 * Interface methods are: 1298 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1299 * loads and validates a superblock on dev. 1300 * if refdev != NULL, compare superblocks on both devices 1301 * Return: 1302 * 0 - dev has a superblock that is compatible with refdev 1303 * 1 - dev has a superblock that is compatible and newer than refdev 1304 * so dev should be used as the refdev in future 1305 * -EINVAL superblock incompatible or invalid 1306 * -othererror e.g. -EIO 1307 * 1308 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1309 * Verify that dev is acceptable into mddev. 1310 * The first time, mddev->raid_disks will be 0, and data from 1311 * dev should be merged in. Subsequent calls check that dev 1312 * is new enough. Return 0 or -EINVAL 1313 * 1314 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1315 * Update the superblock for rdev with data in mddev 1316 * This does not write to disc. 1317 * 1318 */ 1319 1320struct super_type { 1321 char *name; 1322 struct module *owner; 1323 int (*load_super)(struct md_rdev *rdev, 1324 struct md_rdev *refdev, 1325 int minor_version); 1326 int (*validate_super)(struct mddev *mddev, 1327 struct md_rdev *freshest, 1328 struct md_rdev *rdev); 1329 void (*sync_super)(struct mddev *mddev, 1330 struct md_rdev *rdev); 1331 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1332 sector_t num_sectors); 1333 int (*allow_new_offset)(struct md_rdev *rdev, 1334 unsigned long long new_offset); 1335}; 1336 1337/* 1338 * Check that the given mddev has no bitmap. 1339 * 1340 * This function is called from the run method of all personalities that do not 1341 * support bitmaps. It prints an error message and returns non-zero if mddev 1342 * has a bitmap. Otherwise, it returns 0. 1343 * 1344 */ 1345int md_check_no_bitmap(struct mddev *mddev) 1346{ 1347 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1348 return 0; 1349 pr_warn("%s: bitmaps are not supported for %s\n", 1350 mdname(mddev), mddev->pers->head.name); 1351 return 1; 1352} 1353EXPORT_SYMBOL(md_check_no_bitmap); 1354 1355/* 1356 * load_super for 0.90.0 1357 */ 1358static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1359{ 1360 mdp_super_t *sb; 1361 int ret; 1362 bool spare_disk = true; 1363 1364 /* 1365 * Calculate the position of the superblock (512byte sectors), 1366 * it's at the end of the disk. 1367 * 1368 * It also happens to be a multiple of 4Kb. 
1369 */ 1370 rdev->sb_start = calc_dev_sboffset(rdev); 1371 1372 ret = read_disk_sb(rdev, MD_SB_BYTES); 1373 if (ret) 1374 return ret; 1375 1376 ret = -EINVAL; 1377 1378 sb = page_address(rdev->sb_page); 1379 1380 if (sb->md_magic != MD_SB_MAGIC) { 1381 pr_warn("md: invalid raid superblock magic on %pg\n", 1382 rdev->bdev); 1383 goto abort; 1384 } 1385 1386 if (sb->major_version != 0 || 1387 sb->minor_version < 90 || 1388 sb->minor_version > 91) { 1389 pr_warn("Bad version number %d.%d on %pg\n", 1390 sb->major_version, sb->minor_version, rdev->bdev); 1391 goto abort; 1392 } 1393 1394 if (sb->raid_disks <= 0) 1395 goto abort; 1396 1397 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1398 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); 1399 goto abort; 1400 } 1401 1402 rdev->preferred_minor = sb->md_minor; 1403 rdev->data_offset = 0; 1404 rdev->new_data_offset = 0; 1405 rdev->sb_size = MD_SB_BYTES; 1406 rdev->badblocks.shift = -1; 1407 1408 rdev->desc_nr = sb->this_disk.number; 1409 1410 /* not spare disk */ 1411 if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && 1412 sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1413 spare_disk = false; 1414 1415 if (!refdev) { 1416 if (!spare_disk) 1417 ret = 1; 1418 else 1419 ret = 0; 1420 } else { 1421 __u64 ev1, ev2; 1422 mdp_super_t *refsb = page_address(refdev->sb_page); 1423 if (!md_uuid_equal(refsb, sb)) { 1424 pr_warn("md: %pg has different UUID to %pg\n", 1425 rdev->bdev, refdev->bdev); 1426 goto abort; 1427 } 1428 if (!md_sb_equal(refsb, sb)) { 1429 pr_warn("md: %pg has same UUID but different superblock to %pg\n", 1430 rdev->bdev, refdev->bdev); 1431 goto abort; 1432 } 1433 ev1 = md_event(sb); 1434 ev2 = md_event(refsb); 1435 1436 if (!spare_disk && ev1 > ev2) 1437 ret = 1; 1438 else 1439 ret = 0; 1440 } 1441 rdev->sectors = rdev->sb_start; 1442 /* Limit to 4TB as metadata cannot record more than that. 1443 * (not needed for Linear and RAID0 as metadata doesn't 1444 * record this size) 1445 */ 1446 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1447 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1448 1449 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1450 /* "this cannot possibly happen" ... 
*/ 1451 ret = -EINVAL; 1452 1453 abort: 1454 return ret; 1455} 1456 1457static u64 md_bitmap_events_cleared(struct mddev *mddev) 1458{ 1459 struct md_bitmap_stats stats; 1460 int err; 1461 1462 if (!md_bitmap_enabled(mddev, false)) 1463 return 0; 1464 1465 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 1466 if (err) 1467 return 0; 1468 1469 return stats.events_cleared; 1470} 1471 1472/* 1473 * validate_super for 0.90.0 1474 * note: we are not using "freshest" for 0.9 superblock 1475 */ 1476static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1477{ 1478 mdp_disk_t *desc; 1479 mdp_super_t *sb = page_address(rdev->sb_page); 1480 __u64 ev1 = md_event(sb); 1481 1482 rdev->raid_disk = -1; 1483 clear_bit(Faulty, &rdev->flags); 1484 clear_bit(In_sync, &rdev->flags); 1485 clear_bit(Bitmap_sync, &rdev->flags); 1486 clear_bit(WriteMostly, &rdev->flags); 1487 1488 if (mddev->raid_disks == 0) { 1489 mddev->major_version = 0; 1490 mddev->minor_version = sb->minor_version; 1491 mddev->patch_version = sb->patch_version; 1492 mddev->external = 0; 1493 mddev->chunk_sectors = sb->chunk_size >> 9; 1494 mddev->ctime = sb->ctime; 1495 mddev->utime = sb->utime; 1496 mddev->level = sb->level; 1497 mddev->clevel[0] = 0; 1498 mddev->layout = sb->layout; 1499 mddev->raid_disks = sb->raid_disks; 1500 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1501 mddev->events = ev1; 1502 mddev->bitmap_info.offset = 0; 1503 mddev->bitmap_info.space = 0; 1504 /* bitmap can use 60 K after the 4K superblocks */ 1505 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1506 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1507 mddev->reshape_backwards = 0; 1508 1509 if (mddev->minor_version >= 91) { 1510 mddev->reshape_position = sb->reshape_position; 1511 mddev->delta_disks = sb->delta_disks; 1512 mddev->new_level = sb->new_level; 1513 mddev->new_layout = sb->new_layout; 1514 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1515 if (mddev->delta_disks < 0) 1516 mddev->reshape_backwards = 1; 1517 } else { 1518 mddev->reshape_position = MaxSector; 1519 mddev->delta_disks = 0; 1520 mddev->new_level = mddev->level; 1521 mddev->new_layout = mddev->layout; 1522 mddev->new_chunk_sectors = mddev->chunk_sectors; 1523 } 1524 if (mddev->level == 0) 1525 mddev->layout = -1; 1526 1527 if (sb->state & (1<<MD_SB_CLEAN)) 1528 mddev->resync_offset = MaxSector; 1529 else { 1530 if (sb->events_hi == sb->cp_events_hi && 1531 sb->events_lo == sb->cp_events_lo) { 1532 mddev->resync_offset = sb->recovery_cp; 1533 } else 1534 mddev->resync_offset = 0; 1535 } 1536 1537 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1538 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1539 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1540 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1541 1542 mddev->max_disks = MD_SB_DISKS; 1543 1544 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1545 mddev->bitmap_info.file == NULL) { 1546 mddev->bitmap_info.offset = 1547 mddev->bitmap_info.default_offset; 1548 mddev->bitmap_info.space = 1549 mddev->bitmap_info.default_space; 1550 } 1551 1552 } else if (mddev->pers == NULL) { 1553 /* Insist on good event counter while assembling, except 1554 * for spares (which don't need an event count) */ 1555 ++ev1; 1556 if (sb->disks[rdev->desc_nr].state & ( 1557 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1558 if (ev1 < mddev->events) 1559 return -EINVAL; 1560 } else if (mddev->bitmap) { 1561 /* if adding to array with a bitmap, then we can accept an 1562 * older device ... but not too old. 
1563 */ 1564 if (ev1 < md_bitmap_events_cleared(mddev)) 1565 return 0; 1566 if (ev1 < mddev->events) 1567 set_bit(Bitmap_sync, &rdev->flags); 1568 } else { 1569 if (ev1 < mddev->events) 1570 /* just a hot-add of a new device, leave raid_disk at -1 */ 1571 return 0; 1572 } 1573 1574 desc = sb->disks + rdev->desc_nr; 1575 1576 if (desc->state & (1<<MD_DISK_FAULTY)) 1577 set_bit(Faulty, &rdev->flags); 1578 else if (desc->state & (1<<MD_DISK_SYNC)) { 1579 set_bit(In_sync, &rdev->flags); 1580 rdev->raid_disk = desc->raid_disk; 1581 rdev->saved_raid_disk = desc->raid_disk; 1582 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1583 /* active but not in sync implies recovery up to 1584 * reshape position. We don't know exactly where 1585 * that is, so set to zero for now 1586 */ 1587 if (mddev->minor_version >= 91) { 1588 rdev->recovery_offset = 0; 1589 rdev->raid_disk = desc->raid_disk; 1590 } 1591 } 1592 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1593 set_bit(WriteMostly, &rdev->flags); 1594 if (desc->state & (1<<MD_DISK_FAILFAST)) 1595 set_bit(FailFast, &rdev->flags); 1596 return 0; 1597} 1598 1599/* 1600 * sync_super for 0.90.0 1601 */ 1602static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1603{ 1604 mdp_super_t *sb; 1605 struct md_rdev *rdev2; 1606 int next_spare = mddev->raid_disks; 1607 1608 /* make rdev->sb match mddev data.. 1609 * 1610 * 1/ zero out disks 1611 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1612 * 3/ any empty disks < next_spare become removed 1613 * 1614 * disks[0] gets initialised to REMOVED because 1615 * we cannot be sure from other fields if it has 1616 * been initialised or not. 1617 */ 1618 int i; 1619 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1620 1621 rdev->sb_size = MD_SB_BYTES; 1622 1623 sb = page_address(rdev->sb_page); 1624 1625 memset(sb, 0, sizeof(*sb)); 1626 1627 sb->md_magic = MD_SB_MAGIC; 1628 sb->major_version = mddev->major_version; 1629 sb->patch_version = mddev->patch_version; 1630 sb->gvalid_words = 0; /* ignored */ 1631 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1632 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1633 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1634 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1635 1636 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1637 sb->level = mddev->level; 1638 sb->size = mddev->dev_sectors / 2; 1639 sb->raid_disks = mddev->raid_disks; 1640 sb->md_minor = mddev->md_minor; 1641 sb->not_persistent = 0; 1642 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1643 sb->state = 0; 1644 sb->events_hi = (mddev->events>>32); 1645 sb->events_lo = (u32)mddev->events; 1646 1647 if (mddev->reshape_position == MaxSector) 1648 sb->minor_version = 90; 1649 else { 1650 sb->minor_version = 91; 1651 sb->reshape_position = mddev->reshape_position; 1652 sb->new_level = mddev->new_level; 1653 sb->delta_disks = mddev->delta_disks; 1654 sb->new_layout = mddev->new_layout; 1655 sb->new_chunk = mddev->new_chunk_sectors << 9; 1656 } 1657 mddev->minor_version = sb->minor_version; 1658 if (mddev->in_sync) 1659 { 1660 sb->recovery_cp = mddev->resync_offset; 1661 sb->cp_events_hi = (mddev->events>>32); 1662 sb->cp_events_lo = (u32)mddev->events; 1663 if (mddev->resync_offset == MaxSector) 1664 sb->state = (1<< MD_SB_CLEAN); 1665 } else 1666 sb->recovery_cp = 0; 1667 1668 sb->layout = mddev->layout; 1669 sb->chunk_size = mddev->chunk_sectors << 9; 1670 1671 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1672 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1673 1674 
sb->disks[0].state = (1<<MD_DISK_REMOVED); 1675 rdev_for_each(rdev2, mddev) { 1676 mdp_disk_t *d; 1677 int desc_nr; 1678 int is_active = test_bit(In_sync, &rdev2->flags); 1679 1680 if (rdev2->raid_disk >= 0 && 1681 sb->minor_version >= 91) 1682 /* we have nowhere to store the recovery_offset, 1683 * but if it is not below the reshape_position, 1684 * we can piggy-back on that. 1685 */ 1686 is_active = 1; 1687 if (rdev2->raid_disk < 0 || 1688 test_bit(Faulty, &rdev2->flags)) 1689 is_active = 0; 1690 if (is_active) 1691 desc_nr = rdev2->raid_disk; 1692 else 1693 desc_nr = next_spare++; 1694 rdev2->desc_nr = desc_nr; 1695 d = &sb->disks[rdev2->desc_nr]; 1696 nr_disks++; 1697 d->number = rdev2->desc_nr; 1698 d->major = MAJOR(rdev2->bdev->bd_dev); 1699 d->minor = MINOR(rdev2->bdev->bd_dev); 1700 if (is_active) 1701 d->raid_disk = rdev2->raid_disk; 1702 else 1703 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1704 if (test_bit(Faulty, &rdev2->flags)) 1705 d->state = (1<<MD_DISK_FAULTY); 1706 else if (is_active) { 1707 d->state = (1<<MD_DISK_ACTIVE); 1708 if (test_bit(In_sync, &rdev2->flags)) 1709 d->state |= (1<<MD_DISK_SYNC); 1710 active++; 1711 working++; 1712 } else { 1713 d->state = 0; 1714 spare++; 1715 working++; 1716 } 1717 if (test_bit(WriteMostly, &rdev2->flags)) 1718 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1719 if (test_bit(FailFast, &rdev2->flags)) 1720 d->state |= (1<<MD_DISK_FAILFAST); 1721 } 1722 /* now set the "removed" and "faulty" bits on any missing devices */ 1723 for (i=0 ; i < mddev->raid_disks ; i++) { 1724 mdp_disk_t *d = &sb->disks[i]; 1725 if (d->state == 0 && d->number == 0) { 1726 d->number = i; 1727 d->raid_disk = i; 1728 d->state = (1<<MD_DISK_REMOVED); 1729 d->state |= (1<<MD_DISK_FAULTY); 1730 failed++; 1731 } 1732 } 1733 sb->nr_disks = nr_disks; 1734 sb->active_disks = active; 1735 sb->working_disks = working; 1736 sb->failed_disks = failed; 1737 sb->spare_disks = spare; 1738 1739 sb->this_disk = sb->disks[rdev->desc_nr]; 1740 sb->sb_csum = calc_sb_csum(sb); 1741} 1742 1743/* 1744 * rdev_size_change for 0.90.0 1745 */ 1746static unsigned long long 1747super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1748{ 1749 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1750 return 0; /* component must fit device */ 1751 if (rdev->mddev->bitmap_info.offset) 1752 return 0; /* can't move bitmap */ 1753 rdev->sb_start = calc_dev_sboffset(rdev); 1754 if (!num_sectors || num_sectors > rdev->sb_start) 1755 num_sectors = rdev->sb_start; 1756 /* Limit to 4TB as metadata cannot record more than that. 1757 * 4TB == 2^32 KB, or 2*2^32 sectors. 
1758 */ 1759 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1760 num_sectors = (sector_t)(2ULL << 32) - 2; 1761 do { 1762 md_write_metadata(rdev->mddev, rdev, rdev->sb_start, 1763 rdev->sb_size, rdev->sb_page, 0); 1764 } while (md_super_wait(rdev->mddev) < 0); 1765 return num_sectors; 1766} 1767 1768static int 1769super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1770{ 1771 /* non-zero offset changes not possible with v0.90 */ 1772 return new_offset == 0; 1773} 1774 1775/* 1776 * version 1 superblock 1777 */ 1778 1779static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1780{ 1781 __le32 disk_csum; 1782 u32 csum; 1783 unsigned long long newcsum; 1784 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1785 __le32 *isuper = (__le32*)sb; 1786 1787 disk_csum = sb->sb_csum; 1788 sb->sb_csum = 0; 1789 newcsum = 0; 1790 for (; size >= 4; size -= 4) 1791 newcsum += le32_to_cpu(*isuper++); 1792 1793 if (size == 2) 1794 newcsum += le16_to_cpu(*(__le16*) isuper); 1795 1796 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1797 sb->sb_csum = disk_csum; 1798 return cpu_to_le32(csum); 1799} 1800 1801static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1802{ 1803 struct mdp_superblock_1 *sb; 1804 int ret; 1805 sector_t sb_start; 1806 sector_t sectors; 1807 int bmask; 1808 bool spare_disk = true; 1809 1810 /* 1811 * Calculate the position of the superblock in 512byte sectors. 1812 * It is always aligned to a 4K boundary and 1813 * depeding on minor_version, it can be: 1814 * 0: At least 8K, but less than 12K, from end of device 1815 * 1: At start of device 1816 * 2: 4K from start of device. 1817 */ 1818 switch(minor_version) { 1819 case 0: 1820 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; 1821 sb_start &= ~(sector_t)(4*2-1); 1822 break; 1823 case 1: 1824 sb_start = 0; 1825 break; 1826 case 2: 1827 sb_start = 8; 1828 break; 1829 default: 1830 return -EINVAL; 1831 } 1832 rdev->sb_start = sb_start; 1833 1834 /* superblock is rarely larger than 1K, but it can be larger, 1835 * and it is safe to read 4k, so we do that 1836 */ 1837 ret = read_disk_sb(rdev, 4096); 1838 if (ret) return ret; 1839 1840 sb = page_address(rdev->sb_page); 1841 1842 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1843 sb->major_version != cpu_to_le32(1) || 1844 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1845 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1846 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1847 return -EINVAL; 1848 1849 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1850 pr_warn("md: invalid superblock checksum on %pg\n", 1851 rdev->bdev); 1852 return -EINVAL; 1853 } 1854 if (le64_to_cpu(sb->data_size) < 10) { 1855 pr_warn("md: data_size too small on %pg\n", 1856 rdev->bdev); 1857 return -EINVAL; 1858 } 1859 if (sb->pad0 || 1860 sb->pad3[0] || 1861 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) { 1862 pr_warn("Some padding is non-zero on %pg, might be a new feature\n", 1863 rdev->bdev); 1864 if (check_new_feature) 1865 return -EINVAL; 1866 pr_warn("check_new_feature is disabled, data corruption possible\n"); 1867 } 1868 1869 rdev->preferred_minor = 0xffff; 1870 rdev->data_offset = le64_to_cpu(sb->data_offset); 1871 rdev->new_data_offset = rdev->data_offset; 1872 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1873 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1874 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1875 atomic_set(&rdev->corrected_errors, 
le32_to_cpu(sb->cnt_corrected_read)); 1876 1877 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1878 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1879 if (rdev->sb_size & bmask) 1880 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1881 1882 if (minor_version 1883 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1884 return -EINVAL; 1885 if (minor_version 1886 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1887 return -EINVAL; 1888 1889 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1890 1891 if (!rdev->bb_page) { 1892 rdev->bb_page = alloc_page(GFP_KERNEL); 1893 if (!rdev->bb_page) 1894 return -ENOMEM; 1895 } 1896 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1897 rdev->badblocks.count == 0) { 1898 /* need to load the bad block list. 1899 * Currently we limit it to one page. 1900 */ 1901 s32 offset; 1902 sector_t bb_sector; 1903 __le64 *bbp; 1904 int i; 1905 int sectors = le16_to_cpu(sb->bblog_size); 1906 if (sectors > (PAGE_SIZE / 512)) 1907 return -EINVAL; 1908 offset = le32_to_cpu(sb->bblog_offset); 1909 if (offset == 0) 1910 return -EINVAL; 1911 bb_sector = (long long)offset; 1912 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1913 rdev->bb_page, REQ_OP_READ, true)) 1914 return -EIO; 1915 bbp = (__le64 *)page_address(rdev->bb_page); 1916 rdev->badblocks.shift = sb->bblog_shift; 1917 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1918 u64 bb = le64_to_cpu(*bbp); 1919 int count = bb & (0x3ff); 1920 u64 sector = bb >> 10; 1921 sector <<= sb->bblog_shift; 1922 count <<= sb->bblog_shift; 1923 if (bb + 1 == 0) 1924 break; 1925 if (!badblocks_set(&rdev->badblocks, sector, count, 1)) 1926 return -EINVAL; 1927 } 1928 } else if (sb->bblog_offset != 0) 1929 rdev->badblocks.shift = 0; 1930 1931 if ((le32_to_cpu(sb->feature_map) & 1932 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1933 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); 1934 rdev->ppl.size = le16_to_cpu(sb->ppl.size); 1935 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; 1936 } 1937 1938 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && 1939 sb->level != 0) 1940 return -EINVAL; 1941 1942 /* not spare disk */ 1943 if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1944 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1945 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1946 spare_disk = false; 1947 1948 if (!refdev) { 1949 if (!spare_disk) 1950 ret = 1; 1951 else 1952 ret = 0; 1953 } else { 1954 __u64 ev1, ev2; 1955 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1956 1957 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1958 sb->level != refsb->level || 1959 sb->layout != refsb->layout || 1960 sb->chunksize != refsb->chunksize) { 1961 pr_warn("md: %pg has strangely different superblock to %pg\n", 1962 rdev->bdev, 1963 refdev->bdev); 1964 return -EINVAL; 1965 } 1966 ev1 = le64_to_cpu(sb->events); 1967 ev2 = le64_to_cpu(refsb->events); 1968 1969 if (!spare_disk && ev1 > ev2) 1970 ret = 1; 1971 else 1972 ret = 0; 1973 } 1974 if (minor_version) 1975 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 1976 else 1977 sectors = rdev->sb_start; 1978 if (sectors < le64_to_cpu(sb->data_size)) 1979 return -EINVAL; 1980 rdev->sectors = le64_to_cpu(sb->data_size); 1981 return ret; 1982} 1983 1984static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1985{ 1986 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1987 __u64 ev1 = 
le64_to_cpu(sb->events); 1988 int role; 1989 1990 rdev->raid_disk = -1; 1991 clear_bit(Faulty, &rdev->flags); 1992 clear_bit(In_sync, &rdev->flags); 1993 clear_bit(Bitmap_sync, &rdev->flags); 1994 clear_bit(WriteMostly, &rdev->flags); 1995 1996 if (mddev->raid_disks == 0) { 1997 mddev->major_version = 1; 1998 mddev->patch_version = 0; 1999 mddev->external = 0; 2000 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 2001 mddev->ctime = le64_to_cpu(sb->ctime); 2002 mddev->utime = le64_to_cpu(sb->utime); 2003 mddev->level = le32_to_cpu(sb->level); 2004 mddev->clevel[0] = 0; 2005 mddev->layout = le32_to_cpu(sb->layout); 2006 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 2007 mddev->dev_sectors = le64_to_cpu(sb->size); 2008 mddev->events = ev1; 2009 mddev->bitmap_info.offset = 0; 2010 mddev->bitmap_info.space = 0; 2011 /* Default location for bitmap is 1K after superblock 2012 * using 3K - total of 4K 2013 */ 2014 mddev->bitmap_info.default_offset = 1024 >> 9; 2015 mddev->bitmap_info.default_space = (4096-1024) >> 9; 2016 mddev->reshape_backwards = 0; 2017 2018 mddev->resync_offset = le64_to_cpu(sb->resync_offset); 2019 memcpy(mddev->uuid, sb->set_uuid, 16); 2020 2021 mddev->max_disks = (4096-256)/2; 2022 2023 if (!mddev->logical_block_size) 2024 mddev->logical_block_size = le32_to_cpu(sb->logical_block_size); 2025 2026 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 2027 mddev->bitmap_info.file == NULL) { 2028 mddev->bitmap_info.offset = 2029 (__s32)le32_to_cpu(sb->bitmap_offset); 2030 /* Metadata doesn't record how much space is available. 2031 * For 1.0, we assume we can use up to the superblock 2032 * if before, else to 4K beyond superblock. 2033 * For others, assume no change is possible. 2034 */ 2035 if (mddev->minor_version > 0) 2036 mddev->bitmap_info.space = 0; 2037 else if (mddev->bitmap_info.offset > 0) 2038 mddev->bitmap_info.space = 2039 8 - mddev->bitmap_info.offset; 2040 else 2041 mddev->bitmap_info.space = 2042 -mddev->bitmap_info.offset; 2043 } 2044 2045 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 2046 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 2047 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 2048 mddev->new_level = le32_to_cpu(sb->new_level); 2049 mddev->new_layout = le32_to_cpu(sb->new_layout); 2050 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 2051 if (mddev->delta_disks < 0 || 2052 (mddev->delta_disks == 0 && 2053 (le32_to_cpu(sb->feature_map) 2054 & MD_FEATURE_RESHAPE_BACKWARDS))) 2055 mddev->reshape_backwards = 1; 2056 } else { 2057 mddev->reshape_position = MaxSector; 2058 mddev->delta_disks = 0; 2059 mddev->new_level = mddev->level; 2060 mddev->new_layout = mddev->layout; 2061 mddev->new_chunk_sectors = mddev->chunk_sectors; 2062 } 2063 2064 if (mddev->level == 0 && 2065 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) 2066 mddev->layout = -1; 2067 2068 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) 2069 set_bit(MD_HAS_JOURNAL, &mddev->flags); 2070 2071 if (le32_to_cpu(sb->feature_map) & 2072 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { 2073 if (le32_to_cpu(sb->feature_map) & 2074 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) 2075 return -EINVAL; 2076 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && 2077 (le32_to_cpu(sb->feature_map) & 2078 MD_FEATURE_MULTIPLE_PPLS)) 2079 return -EINVAL; 2080 set_bit(MD_HAS_PPL, &mddev->flags); 2081 } 2082 } else if (mddev->pers == NULL) { 2083 /* Insist of good event counter while assembling, except for 2084 * spares (which don't 
need an event count). 2085 * Similar to mdadm, we allow event counter difference of 1 2086 * from the freshest device. 2087 */ 2088 if (rdev->desc_nr >= 0 && 2089 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 2090 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 2091 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 2092 if (ev1 + 1 < mddev->events) 2093 return -EINVAL; 2094 } else if (mddev->bitmap) { 2095 /* If adding to array with a bitmap, then we can accept an 2096 * older device, but not too old. 2097 */ 2098 if (ev1 < md_bitmap_events_cleared(mddev)) 2099 return 0; 2100 if (ev1 < mddev->events) 2101 set_bit(Bitmap_sync, &rdev->flags); 2102 } else { 2103 if (ev1 < mddev->events) 2104 /* just a hot-add of a new device, leave raid_disk at -1 */ 2105 return 0; 2106 } 2107 2108 if (rdev->desc_nr < 0 || 2109 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 2110 role = MD_DISK_ROLE_SPARE; 2111 rdev->desc_nr = -1; 2112 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { 2113 /* 2114 * If we are assembling, and our event counter is smaller than the 2115 * highest event counter, we cannot trust our superblock about the role. 2116 * It could happen that our rdev was marked as Faulty, and all other 2117 * superblocks were updated with +1 event counter. 2118 * Then, before the next superblock update, which typically happens when 2119 * remove_and_add_spares() removes the device from the array, there was 2120 * a crash or reboot. 2121 * If we allow current rdev without consulting the freshest superblock, 2122 * we could cause data corruption. 2123 * Note that in this case our event counter is smaller by 1 than the 2124 * highest, otherwise, this rdev would not be allowed into array; 2125 * both kernel and mdadm allow event counter difference of 1. 2126 */ 2127 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 2128 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 2129 2130 if (rdev->desc_nr >= freshest_max_dev) { 2131 /* this is unexpected, better not proceed */ 2132 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 2133 mdname(mddev), rdev->bdev, rdev->desc_nr, 2134 freshest->bdev, freshest_max_dev); 2135 return -EUCLEAN; 2136 } 2137 2138 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 2139 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 2140 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 2141 } else { 2142 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2143 } 2144 switch (role) { 2145 case MD_DISK_ROLE_SPARE: /* spare */ 2146 break; 2147 case MD_DISK_ROLE_FAULTY: /* faulty */ 2148 set_bit(Faulty, &rdev->flags); 2149 break; 2150 case MD_DISK_ROLE_JOURNAL: /* journal device */ 2151 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 2152 /* journal device without journal feature */ 2153 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 2154 return -EINVAL; 2155 } 2156 set_bit(Journal, &rdev->flags); 2157 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 2158 rdev->raid_disk = 0; 2159 break; 2160 default: 2161 rdev->saved_raid_disk = role; 2162 if ((le32_to_cpu(sb->feature_map) & 2163 MD_FEATURE_RECOVERY_OFFSET)) { 2164 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 2165 if (!(le32_to_cpu(sb->feature_map) & 2166 MD_FEATURE_RECOVERY_BITMAP)) 2167 rdev->saved_raid_disk = -1; 2168 } else { 2169 /* 2170 * If the array is FROZEN, then the device can't 2171 * be in_sync with rest of array. 
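	 *
	 * For example, a dev_roles[] value of 3 is handled by this default
	 * case: the device is given raid_disk 3, and if
	 * MD_FEATURE_RECOVERY_OFFSET was set it is only considered recovered
	 * up to sb->recovery_offset, so In_sync stays clear and the rebuild
	 * can be resumed from that point.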
2172 */ 2173 if (!test_bit(MD_RECOVERY_FROZEN, 2174 &mddev->recovery)) 2175 set_bit(In_sync, &rdev->flags); 2176 } 2177 rdev->raid_disk = role; 2178 break; 2179 } 2180 if (sb->devflags & WriteMostly1) 2181 set_bit(WriteMostly, &rdev->flags); 2182 if (sb->devflags & FailFast1) 2183 set_bit(FailFast, &rdev->flags); 2184 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 2185 set_bit(Replacement, &rdev->flags); 2186 2187 return 0; 2188} 2189 2190static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 2191{ 2192 struct mdp_superblock_1 *sb; 2193 struct md_rdev *rdev2; 2194 int max_dev, i; 2195 /* make rdev->sb match mddev and rdev data. */ 2196 2197 sb = page_address(rdev->sb_page); 2198 2199 sb->feature_map = 0; 2200 sb->pad0 = 0; 2201 sb->recovery_offset = cpu_to_le64(0); 2202 memset(sb->pad3, 0, sizeof(sb->pad3)); 2203 2204 sb->utime = cpu_to_le64((__u64)mddev->utime); 2205 sb->events = cpu_to_le64(mddev->events); 2206 if (mddev->in_sync) 2207 sb->resync_offset = cpu_to_le64(mddev->resync_offset); 2208 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2209 sb->resync_offset = cpu_to_le64(MaxSector); 2210 else 2211 sb->resync_offset = cpu_to_le64(0); 2212 2213 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2214 2215 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2216 sb->size = cpu_to_le64(mddev->dev_sectors); 2217 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2218 sb->level = cpu_to_le32(mddev->level); 2219 sb->layout = cpu_to_le32(mddev->layout); 2220 sb->logical_block_size = cpu_to_le32(mddev->logical_block_size); 2221 if (test_bit(FailFast, &rdev->flags)) 2222 sb->devflags |= FailFast1; 2223 else 2224 sb->devflags &= ~FailFast1; 2225 2226 if (test_bit(WriteMostly, &rdev->flags)) 2227 sb->devflags |= WriteMostly1; 2228 else 2229 sb->devflags &= ~WriteMostly1; 2230 sb->data_offset = cpu_to_le64(rdev->data_offset); 2231 sb->data_size = cpu_to_le64(rdev->sectors); 2232 2233 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2234 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2235 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2236 } 2237 2238 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2239 !test_bit(In_sync, &rdev->flags)) { 2240 sb->feature_map |= 2241 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2242 sb->recovery_offset = 2243 cpu_to_le64(rdev->recovery_offset); 2244 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2245 sb->feature_map |= 2246 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2247 } 2248 /* Note: recovery_offset and journal_tail share space */ 2249 if (test_bit(Journal, &rdev->flags)) 2250 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2251 if (test_bit(Replacement, &rdev->flags)) 2252 sb->feature_map |= 2253 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2254 2255 if (mddev->reshape_position != MaxSector) { 2256 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2257 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2258 sb->new_layout = cpu_to_le32(mddev->new_layout); 2259 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2260 sb->new_level = cpu_to_le32(mddev->new_level); 2261 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2262 if (mddev->delta_disks == 0 && 2263 mddev->reshape_backwards) 2264 sb->feature_map 2265 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2266 if (rdev->new_data_offset != rdev->data_offset) { 2267 sb->feature_map 2268 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2269 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2270 - 
rdev->data_offset)); 2271 } 2272 } 2273 2274 if (mddev_is_clustered(mddev)) 2275 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2276 2277 if (rdev->badblocks.count == 0) 2278 /* Nothing to do for bad blocks*/ ; 2279 else if (sb->bblog_offset == 0) 2280 /* Cannot record bad blocks on this device */ 2281 md_error(mddev, rdev); 2282 else { 2283 struct badblocks *bb = &rdev->badblocks; 2284 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2285 u64 *p = bb->page; 2286 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2287 if (bb->changed) { 2288 unsigned seq; 2289 2290retry: 2291 seq = read_seqbegin(&bb->lock); 2292 2293 memset(bbp, 0xff, PAGE_SIZE); 2294 2295 for (i = 0 ; i < bb->count ; i++) { 2296 u64 internal_bb = p[i]; 2297 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2298 | BB_LEN(internal_bb)); 2299 bbp[i] = cpu_to_le64(store_bb); 2300 } 2301 bb->changed = 0; 2302 if (read_seqretry(&bb->lock, seq)) 2303 goto retry; 2304 2305 bb->sector = (rdev->sb_start + 2306 (int)le32_to_cpu(sb->bblog_offset)); 2307 bb->size = le16_to_cpu(sb->bblog_size); 2308 } 2309 } 2310 2311 max_dev = 0; 2312 rdev_for_each(rdev2, mddev) 2313 if (rdev2->desc_nr+1 > max_dev) 2314 max_dev = rdev2->desc_nr+1; 2315 2316 if (max_dev > le32_to_cpu(sb->max_dev)) { 2317 int bmask; 2318 sb->max_dev = cpu_to_le32(max_dev); 2319 rdev->sb_size = max_dev * 2 + 256; 2320 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2321 if (rdev->sb_size & bmask) 2322 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2323 } else 2324 max_dev = le32_to_cpu(sb->max_dev); 2325 2326 for (i=0; i<max_dev;i++) 2327 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2328 2329 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2330 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2331 2332 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2333 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2334 sb->feature_map |= 2335 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2336 else 2337 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2338 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2339 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2340 } 2341 2342 rdev_for_each(rdev2, mddev) { 2343 i = rdev2->desc_nr; 2344 if (test_bit(Faulty, &rdev2->flags)) 2345 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2346 else if (test_bit(In_sync, &rdev2->flags)) 2347 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2348 else if (test_bit(Journal, &rdev2->flags)) 2349 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2350 else if (rdev2->raid_disk >= 0) 2351 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2352 else 2353 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2354 } 2355 2356 sb->sb_csum = calc_sb_1_csum(sb); 2357} 2358 2359static sector_t super_1_choose_bm_space(sector_t dev_size) 2360{ 2361 sector_t bm_space; 2362 2363 /* if the device is bigger than 8Gig, save 64k for bitmap 2364 * usage, if bigger than 200Gig, save 128k 2365 */ 2366 if (dev_size < 64*2) 2367 bm_space = 0; 2368 else if (dev_size - 64*2 >= 200*1024*1024*2) 2369 bm_space = 128*2; 2370 else if (dev_size - 4*2 > 8*1024*1024*2) 2371 bm_space = 64*2; 2372 else 2373 bm_space = 4*2; 2374 return bm_space; 2375} 2376 2377static unsigned long long 2378super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2379{ 2380 struct mdp_superblock_1 *sb; 2381 sector_t max_sectors; 2382 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2383 return 0; /* component must fit device */ 2384 if (rdev->data_offset != rdev->new_data_offset) 2385 return 0; /* too confusing */ 
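	/*
	 * Rough worked example for the 1.0 (superblock-at-end) case handled
	 * below, assuming a 16GiB component (33554432 512-byte sectors):
	 * sb_start = 33554432 - 16, rounded down to an 8-sector boundary,
	 * = 33554416; super_1_choose_bm_space() reserves 128 sectors for the
	 * bitmap and 8 more are kept for the bad block log, so at most
	 * 33554280 sectors remain usable for data.
	 */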
2386 if (rdev->sb_start < rdev->data_offset) { 2387 /* minor versions 1 and 2; superblock before data */ 2388 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2389 if (!num_sectors || num_sectors > max_sectors) 2390 num_sectors = max_sectors; 2391 } else if (rdev->mddev->bitmap_info.offset) { 2392 /* minor version 0 with bitmap we can't move */ 2393 return 0; 2394 } else { 2395 /* minor version 0; superblock after data */ 2396 sector_t sb_start, bm_space; 2397 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2398 2399 /* 8K is for superblock */ 2400 sb_start = dev_size - 8*2; 2401 sb_start &= ~(sector_t)(4*2 - 1); 2402 2403 bm_space = super_1_choose_bm_space(dev_size); 2404 2405 /* Space that can be used to store date needs to decrease 2406 * superblock bitmap space and bad block space(4K) 2407 */ 2408 max_sectors = sb_start - bm_space - 4*2; 2409 2410 if (!num_sectors || num_sectors > max_sectors) 2411 num_sectors = max_sectors; 2412 rdev->sb_start = sb_start; 2413 } 2414 sb = page_address(rdev->sb_page); 2415 sb->data_size = cpu_to_le64(num_sectors); 2416 sb->super_offset = cpu_to_le64(rdev->sb_start); 2417 sb->sb_csum = calc_sb_1_csum(sb); 2418 do { 2419 md_write_metadata(rdev->mddev, rdev, rdev->sb_start, 2420 rdev->sb_size, rdev->sb_page, 0); 2421 } while (md_super_wait(rdev->mddev) < 0); 2422 return num_sectors; 2423 2424} 2425 2426static int 2427super_1_allow_new_offset(struct md_rdev *rdev, 2428 unsigned long long new_offset) 2429{ 2430 struct mddev *mddev = rdev->mddev; 2431 2432 /* All necessary checks on new >= old have been done */ 2433 if (new_offset >= rdev->data_offset) 2434 return 1; 2435 2436 /* with 1.0 metadata, there is no metadata to tread on 2437 * so we can always move back */ 2438 if (mddev->minor_version == 0) 2439 return 1; 2440 2441 /* otherwise we must be sure not to step on 2442 * any metadata, so stay: 2443 * 36K beyond start of superblock 2444 * beyond end of badblocks 2445 * beyond write-intent bitmap 2446 */ 2447 if (rdev->sb_start + (32+4)*2 > new_offset) 2448 return 0; 2449 2450 if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) { 2451 struct md_bitmap_stats stats; 2452 int err; 2453 2454 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 2455 if (!err && rdev->sb_start + mddev->bitmap_info.offset + 2456 stats.file_pages * (PAGE_SIZE >> 9) > new_offset) 2457 return 0; 2458 } 2459 2460 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2461 return 0; 2462 2463 return 1; 2464} 2465 2466static struct super_type super_types[] = { 2467 [0] = { 2468 .name = "0.90.0", 2469 .owner = THIS_MODULE, 2470 .load_super = super_90_load, 2471 .validate_super = super_90_validate, 2472 .sync_super = super_90_sync, 2473 .rdev_size_change = super_90_rdev_size_change, 2474 .allow_new_offset = super_90_allow_new_offset, 2475 }, 2476 [1] = { 2477 .name = "md-1", 2478 .owner = THIS_MODULE, 2479 .load_super = super_1_load, 2480 .validate_super = super_1_validate, 2481 .sync_super = super_1_sync, 2482 .rdev_size_change = super_1_rdev_size_change, 2483 .allow_new_offset = super_1_allow_new_offset, 2484 }, 2485}; 2486 2487static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2488{ 2489 if (mddev->sync_super) { 2490 mddev->sync_super(mddev, rdev); 2491 return; 2492 } 2493 2494 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2495 2496 super_types[mddev->major_version].sync_super(mddev, rdev); 2497} 2498 2499static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2500{ 2501 struct md_rdev *rdev, *rdev2; 
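	/*
	 * Two arrays are considered to share hardware when any active member
	 * of one sits on the same gendisk as an active member of the other,
	 * e.g. mddev1 using sda1 and mddev2 using sda2 both resolve to the
	 * same bd_disk. Faulty, journal and unassigned devices are ignored.
	 */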
2502 2503 rcu_read_lock(); 2504 rdev_for_each_rcu(rdev, mddev1) { 2505 if (test_bit(Faulty, &rdev->flags) || 2506 test_bit(Journal, &rdev->flags) || 2507 rdev->raid_disk == -1) 2508 continue; 2509 rdev_for_each_rcu(rdev2, mddev2) { 2510 if (test_bit(Faulty, &rdev2->flags) || 2511 test_bit(Journal, &rdev2->flags) || 2512 rdev2->raid_disk == -1) 2513 continue; 2514 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2515 rcu_read_unlock(); 2516 return 1; 2517 } 2518 } 2519 } 2520 rcu_read_unlock(); 2521 return 0; 2522} 2523 2524static LIST_HEAD(pending_raid_disks); 2525 2526/* 2527 * Try to register data integrity profile for an mddev 2528 * 2529 * This is called when an array is started and after a disk has been kicked 2530 * from the array. It only succeeds if all working and active component devices 2531 * are integrity capable with matching profiles. 2532 */ 2533int md_integrity_register(struct mddev *mddev) 2534{ 2535 if (list_empty(&mddev->disks)) 2536 return 0; /* nothing to do */ 2537 if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) 2538 return 0; /* shouldn't register */ 2539 2540 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2541 return 0; 2542} 2543EXPORT_SYMBOL(md_integrity_register); 2544 2545static bool rdev_read_only(struct md_rdev *rdev) 2546{ 2547 return bdev_read_only(rdev->bdev) || 2548 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2549} 2550 2551static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2552{ 2553 char b[BDEVNAME_SIZE]; 2554 int err; 2555 2556 /* prevent duplicates */ 2557 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2558 return -EEXIST; 2559 2560 if (rdev_read_only(rdev) && mddev->pers) 2561 return -EROFS; 2562 2563 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2564 if (!test_bit(Journal, &rdev->flags) && 2565 rdev->sectors && 2566 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2567 if (mddev->pers) { 2568 /* Cannot change size, so fail 2569 * If mddev->level <= 0, then we don't care 2570 * about aligning sizes (e.g. linear) 2571 */ 2572 if (mddev->level > 0) 2573 return -ENOSPC; 2574 } else 2575 mddev->dev_sectors = rdev->sectors; 2576 } 2577 2578 /* Verify rdev->desc_nr is unique. 
2579 * If it is -1, assign a free number, else 2580 * check number is not in use 2581 */ 2582 rcu_read_lock(); 2583 if (rdev->desc_nr < 0) { 2584 int choice = 0; 2585 if (mddev->pers) 2586 choice = mddev->raid_disks; 2587 while (md_find_rdev_nr_rcu(mddev, choice)) 2588 choice++; 2589 rdev->desc_nr = choice; 2590 } else { 2591 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2592 rcu_read_unlock(); 2593 return -EBUSY; 2594 } 2595 } 2596 rcu_read_unlock(); 2597 if (!test_bit(Journal, &rdev->flags) && 2598 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2599 pr_warn("md: %s: array is limited to %d devices\n", 2600 mdname(mddev), mddev->max_disks); 2601 return -EBUSY; 2602 } 2603 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2604 strreplace(b, '/', '!'); 2605 2606 rdev->mddev = mddev; 2607 pr_debug("md: bind<%s>\n", b); 2608 2609 if (mddev->raid_disks) 2610 mddev_create_serial_pool(mddev, rdev); 2611 2612 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2613 goto fail; 2614 2615 /* failure here is OK */ 2616 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2617 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2618 rdev->sysfs_unack_badblocks = 2619 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2620 rdev->sysfs_badblocks = 2621 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2622 2623 list_add_rcu(&rdev->same_set, &mddev->disks); 2624 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2625 2626 return 0; 2627 2628 fail: 2629 pr_warn("md: failed to register dev-%s for %s\n", 2630 b, mdname(mddev)); 2631 mddev_destroy_serial_pool(mddev, rdev); 2632 return err; 2633} 2634 2635void md_autodetect_dev(dev_t dev); 2636 2637/* just for claiming the bdev */ 2638static struct md_rdev claim_rdev; 2639 2640static void export_rdev(struct md_rdev *rdev) 2641{ 2642 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2643 md_rdev_clear(rdev); 2644#ifndef MODULE 2645 if (test_bit(AutoDetected, &rdev->flags)) 2646 md_autodetect_dev(rdev->bdev->bd_dev); 2647#endif 2648 fput(rdev->bdev_file); 2649 rdev->bdev = NULL; 2650 kobject_put(&rdev->kobj); 2651} 2652 2653static void md_kick_rdev_from_array(struct md_rdev *rdev) 2654{ 2655 struct mddev *mddev = rdev->mddev; 2656 2657 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2658 list_del_rcu(&rdev->same_set); 2659 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2660 mddev_destroy_serial_pool(rdev->mddev, rdev); 2661 WRITE_ONCE(rdev->mddev, NULL); 2662 sysfs_remove_link(&rdev->kobj, "block"); 2663 sysfs_put(rdev->sysfs_state); 2664 sysfs_put(rdev->sysfs_unack_badblocks); 2665 sysfs_put(rdev->sysfs_badblocks); 2666 rdev->sysfs_state = NULL; 2667 rdev->sysfs_unack_badblocks = NULL; 2668 rdev->sysfs_badblocks = NULL; 2669 rdev->badblocks.count = 0; 2670 2671 synchronize_rcu(); 2672 2673 /* 2674 * kobject_del() will wait for all in progress writers to be done, where 2675 * reconfig_mutex is held, hence it can't be called under 2676 * reconfig_mutex and it's delayed to mddev_unlock(). 
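	 * The rdev is therefore only parked on mddev->deleting here; the
	 * final export_rdev()/kobject_put() runs once the caller eventually
	 * drops reconfig_mutex.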
2677 */ 2678 list_add(&rdev->same_set, &mddev->deleting); 2679} 2680 2681static void export_array(struct mddev *mddev) 2682{ 2683 struct md_rdev *rdev; 2684 2685 while (!list_empty(&mddev->disks)) { 2686 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2687 same_set); 2688 md_kick_rdev_from_array(rdev); 2689 } 2690 mddev->raid_disks = 0; 2691 mddev->major_version = 0; 2692} 2693 2694static bool set_in_sync(struct mddev *mddev) 2695{ 2696 lockdep_assert_held(&mddev->lock); 2697 if (!mddev->in_sync) { 2698 mddev->sync_checkers++; 2699 spin_unlock(&mddev->lock); 2700 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2701 spin_lock(&mddev->lock); 2702 if (!mddev->in_sync && 2703 percpu_ref_is_zero(&mddev->writes_pending)) { 2704 mddev->in_sync = 1; 2705 /* 2706 * Ensure ->in_sync is visible before we clear 2707 * ->sync_checkers. 2708 */ 2709 smp_mb(); 2710 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2711 sysfs_notify_dirent_safe(mddev->sysfs_state); 2712 } 2713 if (--mddev->sync_checkers == 0) 2714 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2715 } 2716 if (mddev->safemode == 1) 2717 mddev->safemode = 0; 2718 return mddev->in_sync; 2719} 2720 2721static void sync_sbs(struct mddev *mddev, int nospares) 2722{ 2723 /* Update each superblock (in-memory image), but 2724 * if we are allowed to, skip spares which already 2725 * have the right event counter, or have one earlier 2726 * (which would mean they aren't being marked as dirty 2727 * with the rest of the array) 2728 */ 2729 struct md_rdev *rdev; 2730 rdev_for_each(rdev, mddev) { 2731 if (rdev->sb_events == mddev->events || 2732 (nospares && 2733 rdev->raid_disk < 0 && 2734 rdev->sb_events+1 == mddev->events)) { 2735 /* Don't update this superblock */ 2736 rdev->sb_loaded = 2; 2737 } else { 2738 sync_super(mddev, rdev); 2739 rdev->sb_loaded = 1; 2740 } 2741 } 2742} 2743 2744static bool does_sb_need_changing(struct mddev *mddev) 2745{ 2746 struct md_rdev *rdev = NULL, *iter; 2747 struct mdp_superblock_1 *sb; 2748 int role; 2749 2750 /* Find a good rdev */ 2751 rdev_for_each(iter, mddev) 2752 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2753 rdev = iter; 2754 break; 2755 } 2756 2757 /* No good device found. */ 2758 if (!rdev) 2759 return false; 2760 2761 sb = page_address(rdev->sb_page); 2762 /* Check if a device has become faulty or a spare become active */ 2763 rdev_for_each(rdev, mddev) { 2764 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2765 /* Device activated? */ 2766 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2767 !test_bit(Faulty, &rdev->flags)) 2768 return true; 2769 /* Device turned faulty? 
*/ 2770 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) 2771 return true; 2772 } 2773 2774 /* Check if any mddev parameters have changed */ 2775 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2776 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2777 (mddev->layout != le32_to_cpu(sb->layout)) || 2778 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2779 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2780 return true; 2781 2782 return false; 2783} 2784 2785void md_update_sb(struct mddev *mddev, int force_change) 2786{ 2787 struct md_rdev *rdev; 2788 int sync_req; 2789 int nospares = 0; 2790 int any_badblocks_changed = 0; 2791 int ret = -1; 2792 2793 if (!md_is_rdwr(mddev)) { 2794 if (force_change) 2795 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2796 if (!mddev_is_dm(mddev)) 2797 pr_err_ratelimited("%s: can't update sb for read-only array %s\n", 2798 __func__, mdname(mddev)); 2799 return; 2800 } 2801 2802repeat: 2803 if (mddev_is_clustered(mddev)) { 2804 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2805 force_change = 1; 2806 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2807 nospares = 1; 2808 ret = mddev->cluster_ops->metadata_update_start(mddev); 2809 /* Has someone else has updated the sb */ 2810 if (!does_sb_need_changing(mddev)) { 2811 if (ret == 0) 2812 mddev->cluster_ops->metadata_update_cancel(mddev); 2813 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2814 BIT(MD_SB_CHANGE_DEVS) | 2815 BIT(MD_SB_CHANGE_CLEAN)); 2816 return; 2817 } 2818 } 2819 2820 /* 2821 * First make sure individual recovery_offsets are correct 2822 * curr_resync_completed can only be used during recovery. 2823 * During reshape/resync it might use array-addresses rather 2824 * that device addresses. 2825 */ 2826 rdev_for_each(rdev, mddev) { 2827 if (rdev->raid_disk >= 0 && 2828 mddev->delta_disks >= 0 && 2829 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2830 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2831 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2832 !test_bit(Journal, &rdev->flags) && 2833 !test_bit(In_sync, &rdev->flags) && 2834 mddev->curr_resync_completed > rdev->recovery_offset) 2835 rdev->recovery_offset = mddev->curr_resync_completed; 2836 2837 } 2838 if (!mddev->persistent) { 2839 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2840 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2841 if (!mddev->external) { 2842 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2843 rdev_for_each(rdev, mddev) { 2844 if (rdev->badblocks.changed) { 2845 rdev->badblocks.changed = 0; 2846 ack_all_badblocks(&rdev->badblocks); 2847 md_error(mddev, rdev); 2848 } 2849 clear_bit(Blocked, &rdev->flags); 2850 clear_bit(BlockedBadBlocks, &rdev->flags); 2851 wake_up(&rdev->blocked_wait); 2852 } 2853 } 2854 wake_up(&mddev->sb_wait); 2855 return; 2856 } 2857 2858 spin_lock(&mddev->lock); 2859 2860 mddev->utime = ktime_get_real_seconds(); 2861 2862 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2863 force_change = 1; 2864 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2865 /* just a clean<-> dirty transition, possibly leave spares alone, 2866 * though if events isn't the right even/odd, we will have to do 2867 * spares after all 2868 */ 2869 nospares = 1; 2870 if (force_change) 2871 nospares = 0; 2872 if (mddev->degraded) 2873 /* If the array is degraded, then skipping spares is both 2874 * dangerous and fairly pointless. 
2875 * Dangerous because a device that was removed from the array 2876 * might have a event_count that still looks up-to-date, 2877 * so it can be re-added without a resync. 2878 * Pointless because if there are any spares to skip, 2879 * then a recovery will happen and soon that array won't 2880 * be degraded any more and the spare can go back to sleep then. 2881 */ 2882 nospares = 0; 2883 2884 sync_req = mddev->in_sync; 2885 2886 /* If this is just a dirty<->clean transition, and the array is clean 2887 * and 'events' is odd, we can roll back to the previous clean state */ 2888 if (nospares 2889 && (mddev->in_sync && mddev->resync_offset == MaxSector) 2890 && mddev->can_decrease_events 2891 && mddev->events != 1) { 2892 mddev->events--; 2893 mddev->can_decrease_events = 0; 2894 } else { 2895 /* otherwise we have to go forward and ... */ 2896 mddev->events ++; 2897 mddev->can_decrease_events = nospares; 2898 } 2899 2900 /* 2901 * This 64-bit counter should never wrap. 2902 * Either we are in around ~1 trillion A.C., assuming 2903 * 1 reboot per second, or we have a bug... 2904 */ 2905 WARN_ON(mddev->events == 0); 2906 2907 rdev_for_each(rdev, mddev) { 2908 if (rdev->badblocks.changed) 2909 any_badblocks_changed++; 2910 if (test_bit(Faulty, &rdev->flags)) 2911 set_bit(FaultRecorded, &rdev->flags); 2912 } 2913 2914 sync_sbs(mddev, nospares); 2915 spin_unlock(&mddev->lock); 2916 2917 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2918 mdname(mddev), mddev->in_sync); 2919 2920 mddev_add_trace_msg(mddev, "md md_update_sb"); 2921rewrite: 2922 if (md_bitmap_enabled(mddev, false)) 2923 mddev->bitmap_ops->update_sb(mddev->bitmap); 2924 rdev_for_each(rdev, mddev) { 2925 if (rdev->sb_loaded != 1) 2926 continue; /* no noise on spare devices */ 2927 2928 if (!test_bit(Faulty, &rdev->flags)) { 2929 md_write_metadata(mddev, rdev, rdev->sb_start, 2930 rdev->sb_size, rdev->sb_page, 0); 2931 pr_debug("md: (write) %pg's sb offset: %llu\n", 2932 rdev->bdev, 2933 (unsigned long long)rdev->sb_start); 2934 rdev->sb_events = mddev->events; 2935 if (rdev->badblocks.size) { 2936 md_write_metadata(mddev, rdev, 2937 rdev->badblocks.sector, 2938 rdev->badblocks.size << 9, 2939 rdev->bb_page, 0); 2940 rdev->badblocks.size = 0; 2941 } 2942 2943 } else 2944 pr_debug("md: %pg (skipping faulty)\n", 2945 rdev->bdev); 2946 } 2947 if (md_super_wait(mddev) < 0) 2948 goto rewrite; 2949 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2950 2951 if (mddev_is_clustered(mddev) && ret == 0) 2952 mddev->cluster_ops->metadata_update_finish(mddev); 2953 2954 if (mddev->in_sync != sync_req || 2955 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2956 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2957 /* have to write it out again */ 2958 goto repeat; 2959 wake_up(&mddev->sb_wait); 2960 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2961 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2962 2963 rdev_for_each(rdev, mddev) { 2964 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2965 clear_bit(Blocked, &rdev->flags); 2966 2967 if (any_badblocks_changed) 2968 ack_all_badblocks(&rdev->badblocks); 2969 clear_bit(BlockedBadBlocks, &rdev->flags); 2970 wake_up(&rdev->blocked_wait); 2971 } 2972} 2973EXPORT_SYMBOL(md_update_sb); 2974 2975static int add_bound_rdev(struct md_rdev *rdev) 2976{ 2977 struct mddev *mddev = rdev->mddev; 2978 int err = 0; 2979 bool add_journal = test_bit(Journal, &rdev->flags); 2980 2981 if (!mddev->pers->hot_remove_disk || add_journal) { 
2982 /* If there is hot_add_disk but no hot_remove_disk 2983 * then added disks for geometry changes, 2984 * and should be added immediately. 2985 */ 2986 super_types[mddev->major_version]. 2987 validate_super(mddev, NULL/*freshest*/, rdev); 2988 err = mddev->pers->hot_add_disk(mddev, rdev); 2989 if (err) { 2990 md_kick_rdev_from_array(rdev); 2991 return err; 2992 } 2993 } 2994 sysfs_notify_dirent_safe(rdev->sysfs_state); 2995 2996 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2997 if (mddev->degraded) 2998 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2999 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3000 md_new_event(); 3001 return 0; 3002} 3003 3004/* words written to sysfs files may, or may not, be \n terminated. 3005 * We want to accept with case. For this we use cmd_match. 3006 */ 3007static int cmd_match(const char *cmd, const char *str) 3008{ 3009 /* See if cmd, written into a sysfs file, matches 3010 * str. They must either be the same, or cmd can 3011 * have a trailing newline 3012 */ 3013 while (*cmd && *str && *cmd == *str) { 3014 cmd++; 3015 str++; 3016 } 3017 if (*cmd == '\n') 3018 cmd++; 3019 if (*str || *cmd) 3020 return 0; 3021 return 1; 3022} 3023 3024struct rdev_sysfs_entry { 3025 struct attribute attr; 3026 ssize_t (*show)(struct md_rdev *, char *); 3027 ssize_t (*store)(struct md_rdev *, const char *, size_t); 3028}; 3029 3030static ssize_t 3031state_show(struct md_rdev *rdev, char *page) 3032{ 3033 char *sep = ","; 3034 size_t len = 0; 3035 unsigned long flags = READ_ONCE(rdev->flags); 3036 3037 if (test_bit(Faulty, &flags) || 3038 (!test_bit(ExternalBbl, &flags) && 3039 rdev->badblocks.unacked_exist)) 3040 len += sprintf(page+len, "faulty%s", sep); 3041 if (test_bit(In_sync, &flags)) 3042 len += sprintf(page+len, "in_sync%s", sep); 3043 if (test_bit(Journal, &flags)) 3044 len += sprintf(page+len, "journal%s", sep); 3045 if (test_bit(WriteMostly, &flags)) 3046 len += sprintf(page+len, "write_mostly%s", sep); 3047 if (test_bit(Blocked, &flags) || 3048 (rdev->badblocks.unacked_exist 3049 && !test_bit(Faulty, &flags))) 3050 len += sprintf(page+len, "blocked%s", sep); 3051 if (!test_bit(Faulty, &flags) && 3052 !test_bit(Journal, &flags) && 3053 !test_bit(In_sync, &flags)) 3054 len += sprintf(page+len, "spare%s", sep); 3055 if (test_bit(WriteErrorSeen, &flags)) 3056 len += sprintf(page+len, "write_error%s", sep); 3057 if (test_bit(WantReplacement, &flags)) 3058 len += sprintf(page+len, "want_replacement%s", sep); 3059 if (test_bit(Replacement, &flags)) 3060 len += sprintf(page+len, "replacement%s", sep); 3061 if (test_bit(ExternalBbl, &flags)) 3062 len += sprintf(page+len, "external_bbl%s", sep); 3063 if (test_bit(FailFast, &flags)) 3064 len += sprintf(page+len, "failfast%s", sep); 3065 3066 if (len) 3067 len -= strlen(sep); 3068 3069 return len+sprintf(page+len, "\n"); 3070} 3071 3072static ssize_t 3073state_store(struct md_rdev *rdev, const char *buf, size_t len) 3074{ 3075 /* can write 3076 * faulty - simulates an error 3077 * remove - disconnects the device 3078 * writemostly - sets write_mostly 3079 * -writemostly - clears write_mostly 3080 * blocked - sets the Blocked flags 3081 * -blocked - clears the Blocked and possibly simulates an error 3082 * insync - sets Insync providing device isn't active 3083 * -insync - clear Insync for a device with a slot assigned, 3084 * so that it gets rebuilt based on bitmap 3085 * write_error - sets WriteErrorSeen 3086 * -write_error - clears WriteErrorSeen 3087 * {,-}failfast - set/clear FailFast 3088 */ 3089 3090 struct mddev 
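	/*
	 * Sketch of typical usage from userspace (array and member names are
	 * only examples):
	 *   echo faulty > /sys/block/md0/md/dev-sdb1/state   # simulate a failure
	 *   echo remove > /sys/block/md0/md/dev-sdb1/state   # then detach it
	 */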
*mddev = rdev->mddev; 3091 int err = -EINVAL; 3092 bool need_update_sb = false; 3093 3094 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 3095 md_error(rdev->mddev, rdev); 3096 3097 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 3098 err = -EBUSY; 3099 else 3100 err = 0; 3101 } else if (cmd_match(buf, "remove")) { 3102 if (rdev->mddev->pers) { 3103 clear_bit(Blocked, &rdev->flags); 3104 remove_and_add_spares(rdev->mddev, rdev); 3105 } 3106 if (rdev->raid_disk >= 0) 3107 err = -EBUSY; 3108 else { 3109 err = 0; 3110 if (mddev_is_clustered(mddev)) 3111 err = mddev->cluster_ops->remove_disk(mddev, rdev); 3112 3113 if (err == 0) { 3114 md_kick_rdev_from_array(rdev); 3115 if (mddev->pers) 3116 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3117 md_new_event(); 3118 } 3119 } 3120 } else if (cmd_match(buf, "writemostly")) { 3121 set_bit(WriteMostly, &rdev->flags); 3122 mddev_create_serial_pool(rdev->mddev, rdev); 3123 need_update_sb = true; 3124 err = 0; 3125 } else if (cmd_match(buf, "-writemostly")) { 3126 mddev_destroy_serial_pool(rdev->mddev, rdev); 3127 clear_bit(WriteMostly, &rdev->flags); 3128 need_update_sb = true; 3129 err = 0; 3130 } else if (cmd_match(buf, "blocked")) { 3131 set_bit(Blocked, &rdev->flags); 3132 err = 0; 3133 } else if (cmd_match(buf, "-blocked")) { 3134 if (!test_bit(Faulty, &rdev->flags) && 3135 !test_bit(ExternalBbl, &rdev->flags) && 3136 rdev->badblocks.unacked_exist) { 3137 /* metadata handler doesn't understand badblocks, 3138 * so we need to fail the device 3139 */ 3140 md_error(rdev->mddev, rdev); 3141 } 3142 clear_bit(Blocked, &rdev->flags); 3143 clear_bit(BlockedBadBlocks, &rdev->flags); 3144 wake_up(&rdev->blocked_wait); 3145 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3146 3147 err = 0; 3148 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3149 set_bit(In_sync, &rdev->flags); 3150 err = 0; 3151 } else if (cmd_match(buf, "failfast")) { 3152 set_bit(FailFast, &rdev->flags); 3153 need_update_sb = true; 3154 err = 0; 3155 } else if (cmd_match(buf, "-failfast")) { 3156 clear_bit(FailFast, &rdev->flags); 3157 need_update_sb = true; 3158 err = 0; 3159 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3160 !test_bit(Journal, &rdev->flags)) { 3161 if (rdev->mddev->pers == NULL) { 3162 clear_bit(In_sync, &rdev->flags); 3163 rdev->saved_raid_disk = rdev->raid_disk; 3164 rdev->raid_disk = -1; 3165 err = 0; 3166 } 3167 } else if (cmd_match(buf, "write_error")) { 3168 set_bit(WriteErrorSeen, &rdev->flags); 3169 err = 0; 3170 } else if (cmd_match(buf, "-write_error")) { 3171 clear_bit(WriteErrorSeen, &rdev->flags); 3172 err = 0; 3173 } else if (cmd_match(buf, "want_replacement")) { 3174 /* Any non-spare device that is not a replacement can 3175 * become want_replacement at any time, but we then need to 3176 * check if recovery is needed. 3177 */ 3178 if (rdev->raid_disk >= 0 && 3179 !test_bit(Journal, &rdev->flags) && 3180 !test_bit(Replacement, &rdev->flags)) 3181 set_bit(WantReplacement, &rdev->flags); 3182 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3183 err = 0; 3184 } else if (cmd_match(buf, "-want_replacement")) { 3185 /* Clearing 'want_replacement' is always allowed. 3186 * Once replacements starts it is too late though. 3187 */ 3188 err = 0; 3189 clear_bit(WantReplacement, &rdev->flags); 3190 } else if (cmd_match(buf, "replacement")) { 3191 /* Can only set a device as a replacement when array has not 3192 * yet been started. Once running, replacement is automatic 3193 * from spares, or by assigning 'slot'. 
3194 */ 3195 if (rdev->mddev->pers) 3196 err = -EBUSY; 3197 else { 3198 set_bit(Replacement, &rdev->flags); 3199 err = 0; 3200 } 3201 } else if (cmd_match(buf, "-replacement")) { 3202 /* Similarly, can only clear Replacement before start */ 3203 if (rdev->mddev->pers) 3204 err = -EBUSY; 3205 else { 3206 clear_bit(Replacement, &rdev->flags); 3207 err = 0; 3208 } 3209 } else if (cmd_match(buf, "re-add")) { 3210 if (!rdev->mddev->pers) 3211 err = -EINVAL; 3212 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3213 rdev->saved_raid_disk >= 0) { 3214 /* clear_bit is performed _after_ all the devices 3215 * have their local Faulty bit cleared. If any writes 3216 * happen in the meantime in the local node, they 3217 * will land in the local bitmap, which will be synced 3218 * by this node eventually 3219 */ 3220 if (!mddev_is_clustered(rdev->mddev) || 3221 (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { 3222 clear_bit(Faulty, &rdev->flags); 3223 err = add_bound_rdev(rdev); 3224 } 3225 } else 3226 err = -EBUSY; 3227 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3228 set_bit(ExternalBbl, &rdev->flags); 3229 rdev->badblocks.shift = 0; 3230 err = 0; 3231 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3232 clear_bit(ExternalBbl, &rdev->flags); 3233 err = 0; 3234 } 3235 if (need_update_sb) 3236 md_update_sb(mddev, 1); 3237 if (!err) 3238 sysfs_notify_dirent_safe(rdev->sysfs_state); 3239 return err ? err : len; 3240} 3241static struct rdev_sysfs_entry rdev_state = 3242__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3243 3244static ssize_t 3245errors_show(struct md_rdev *rdev, char *page) 3246{ 3247 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3248} 3249 3250static ssize_t 3251errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3252{ 3253 unsigned int n; 3254 int rv; 3255 3256 rv = kstrtouint(buf, 10, &n); 3257 if (rv < 0) 3258 return rv; 3259 atomic_set(&rdev->corrected_errors, n); 3260 return len; 3261} 3262static struct rdev_sysfs_entry rdev_errors = 3263__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3264 3265static ssize_t 3266slot_show(struct md_rdev *rdev, char *page) 3267{ 3268 if (test_bit(Journal, &rdev->flags)) 3269 return sprintf(page, "journal\n"); 3270 else if (rdev->raid_disk < 0) 3271 return sprintf(page, "none\n"); 3272 else 3273 return sprintf(page, "%d\n", rdev->raid_disk); 3274} 3275 3276static ssize_t 3277slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3278{ 3279 int slot; 3280 int err; 3281 3282 if (test_bit(Journal, &rdev->flags)) 3283 return -EBUSY; 3284 if (strncmp(buf, "none", 4)==0) 3285 slot = -1; 3286 else { 3287 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3288 if (err < 0) 3289 return err; 3290 if (slot < 0) 3291 /* overflow */ 3292 return -ENOSPC; 3293 } 3294 if (rdev->mddev->pers && slot == -1) { 3295 /* Setting 'slot' on an active array requires also 3296 * updating the 'rd%d' link, and communicating 3297 * with the personality with ->hot_*_disk. 3298 * For now we only support removing 3299 * failed/spare devices. This normally happens automatically, 3300 * but not when the metadata is externally managed. 
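	 * For such externally managed arrays userspace does it explicitly,
	 * e.g. (names are examples only):
	 *   echo none > /sys/block/md0/md/dev-sdb1/slot
	 * to release the slot of a failed or spare member.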
3301 */ 3302 if (rdev->raid_disk == -1) 3303 return -EEXIST; 3304 /* personality does all needed checks */ 3305 if (rdev->mddev->pers->hot_remove_disk == NULL) 3306 return -EINVAL; 3307 clear_bit(Blocked, &rdev->flags); 3308 remove_and_add_spares(rdev->mddev, rdev); 3309 if (rdev->raid_disk >= 0) 3310 return -EBUSY; 3311 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3312 } else if (rdev->mddev->pers) { 3313 /* Activating a spare .. or possibly reactivating 3314 * if we ever get bitmaps working here. 3315 */ 3316 int err; 3317 3318 if (rdev->raid_disk != -1) 3319 return -EBUSY; 3320 3321 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3322 return -EBUSY; 3323 3324 if (rdev->mddev->pers->hot_add_disk == NULL) 3325 return -EINVAL; 3326 3327 if (slot >= rdev->mddev->raid_disks && 3328 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3329 return -ENOSPC; 3330 3331 rdev->raid_disk = slot; 3332 if (test_bit(In_sync, &rdev->flags)) 3333 rdev->saved_raid_disk = slot; 3334 else 3335 rdev->saved_raid_disk = -1; 3336 clear_bit(In_sync, &rdev->flags); 3337 clear_bit(Bitmap_sync, &rdev->flags); 3338 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3339 if (err) { 3340 rdev->raid_disk = -1; 3341 return err; 3342 } else 3343 sysfs_notify_dirent_safe(rdev->sysfs_state); 3344 /* failure here is OK */; 3345 sysfs_link_rdev(rdev->mddev, rdev); 3346 /* don't wakeup anyone, leave that to userspace. */ 3347 } else { 3348 if (slot >= rdev->mddev->raid_disks && 3349 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3350 return -ENOSPC; 3351 rdev->raid_disk = slot; 3352 /* assume it is working */ 3353 clear_bit(Faulty, &rdev->flags); 3354 clear_bit(WriteMostly, &rdev->flags); 3355 set_bit(In_sync, &rdev->flags); 3356 sysfs_notify_dirent_safe(rdev->sysfs_state); 3357 } 3358 return len; 3359} 3360 3361static struct rdev_sysfs_entry rdev_slot = 3362__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3363 3364static ssize_t 3365offset_show(struct md_rdev *rdev, char *page) 3366{ 3367 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3368} 3369 3370static ssize_t 3371offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3372{ 3373 unsigned long long offset; 3374 if (kstrtoull(buf, 10, &offset) < 0) 3375 return -EINVAL; 3376 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3377 return -EBUSY; 3378 if (rdev->sectors && rdev->mddev->external) 3379 /* Must set offset before size, so overlap checks 3380 * can be sane */ 3381 return -EBUSY; 3382 rdev->data_offset = offset; 3383 rdev->new_data_offset = offset; 3384 return len; 3385} 3386 3387static struct rdev_sysfs_entry rdev_offset = 3388__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3389 3390static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3391{ 3392 return sprintf(page, "%llu\n", 3393 (unsigned long long)rdev->new_data_offset); 3394} 3395 3396static ssize_t new_offset_store(struct md_rdev *rdev, 3397 const char *buf, size_t len) 3398{ 3399 unsigned long long new_offset; 3400 struct mddev *mddev = rdev->mddev; 3401 3402 if (kstrtoull(buf, 10, &new_offset) < 0) 3403 return -EINVAL; 3404 3405 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3406 return -EBUSY; 3407 if (new_offset == rdev->data_offset) 3408 /* reset is always permitted */ 3409 ; 3410 else if (new_offset > rdev->data_offset) { 3411 /* must not push array size beyond rdev_sectors */ 3412 if (new_offset - rdev->data_offset 3413 + mddev->dev_sectors > rdev->sectors) 3414 return -E2BIG; 3415 } 3416 /* 
Metadata worries about other space details. */ 3417 3418 /* decreasing the offset is inconsistent with a backwards 3419 * reshape. 3420 */ 3421 if (new_offset < rdev->data_offset && 3422 mddev->reshape_backwards) 3423 return -EINVAL; 3424 /* Increasing offset is inconsistent with forwards 3425 * reshape. reshape_direction should be set to 3426 * 'backwards' first. 3427 */ 3428 if (new_offset > rdev->data_offset && 3429 !mddev->reshape_backwards) 3430 return -EINVAL; 3431 3432 if (mddev->pers && mddev->persistent && 3433 !super_types[mddev->major_version] 3434 .allow_new_offset(rdev, new_offset)) 3435 return -E2BIG; 3436 rdev->new_data_offset = new_offset; 3437 if (new_offset > rdev->data_offset) 3438 mddev->reshape_backwards = 1; 3439 else if (new_offset < rdev->data_offset) 3440 mddev->reshape_backwards = 0; 3441 3442 return len; 3443} 3444static struct rdev_sysfs_entry rdev_new_offset = 3445__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3446 3447static ssize_t 3448rdev_size_show(struct md_rdev *rdev, char *page) 3449{ 3450 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3451} 3452 3453static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3454{ 3455 /* check if two start/length pairs overlap */ 3456 if (a->data_offset + a->sectors <= b->data_offset) 3457 return false; 3458 if (b->data_offset + b->sectors <= a->data_offset) 3459 return false; 3460 return true; 3461} 3462 3463static bool md_rdev_overlaps(struct md_rdev *rdev) 3464{ 3465 struct mddev *mddev; 3466 struct md_rdev *rdev2; 3467 3468 spin_lock(&all_mddevs_lock); 3469 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3470 if (test_bit(MD_DELETED, &mddev->flags)) 3471 continue; 3472 rdev_for_each(rdev2, mddev) { 3473 if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3474 md_rdevs_overlap(rdev, rdev2)) { 3475 spin_unlock(&all_mddevs_lock); 3476 return true; 3477 } 3478 } 3479 } 3480 spin_unlock(&all_mddevs_lock); 3481 return false; 3482} 3483 3484static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3485{ 3486 unsigned long long blocks; 3487 sector_t new; 3488 3489 if (kstrtoull(buf, 10, &blocks) < 0) 3490 return -EINVAL; 3491 3492 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3493 return -EINVAL; /* sector conversion overflow */ 3494 3495 new = blocks * 2; 3496 if (new != blocks * 2) 3497 return -EINVAL; /* unsigned long long to sector_t overflow */ 3498 3499 *sectors = new; 3500 return 0; 3501} 3502 3503static ssize_t 3504rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3505{ 3506 struct mddev *my_mddev = rdev->mddev; 3507 sector_t oldsectors = rdev->sectors; 3508 sector_t sectors; 3509 3510 if (test_bit(Journal, &rdev->flags)) 3511 return -EBUSY; 3512 if (strict_blocks_to_sectors(buf, &sectors) < 0) 3513 return -EINVAL; 3514 if (rdev->data_offset != rdev->new_data_offset) 3515 return -EINVAL; /* too confusing */ 3516 if (my_mddev->pers && rdev->raid_disk >= 0) { 3517 if (my_mddev->persistent) { 3518 sectors = super_types[my_mddev->major_version]. 
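			/*
			 * The per-format rdev_size_change() above rewrites the
			 * on-disk superblock for the new size and returns the
			 * usable sector count actually achieved, or 0 if the
			 * size cannot be changed.
			 */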
3519 rdev_size_change(rdev, sectors); 3520 if (!sectors) 3521 return -EBUSY; 3522 } else if (!sectors) 3523 sectors = bdev_nr_sectors(rdev->bdev) - 3524 rdev->data_offset; 3525 if (!my_mddev->pers->resize) 3526 /* Cannot change size for RAID0 or Linear etc */ 3527 return -EINVAL; 3528 } 3529 if (sectors < my_mddev->dev_sectors) 3530 return -EINVAL; /* component must fit device */ 3531 3532 rdev->sectors = sectors; 3533 3534 /* 3535 * Check that all other rdevs with the same bdev do not overlap. This 3536 * check does not provide a hard guarantee, it just helps avoid 3537 * dangerous mistakes. 3538 */ 3539 if (sectors > oldsectors && my_mddev->external && 3540 md_rdev_overlaps(rdev)) { 3541 /* 3542 * Someone else could have slipped in a size change here, but 3543 * doing so is just silly. We put oldsectors back because we 3544 * know it is safe, and trust userspace not to race with itself. 3545 */ 3546 rdev->sectors = oldsectors; 3547 return -EBUSY; 3548 } 3549 return len; 3550} 3551 3552static struct rdev_sysfs_entry rdev_size = 3553__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3554 3555static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3556{ 3557 unsigned long long recovery_start = rdev->recovery_offset; 3558 3559 if (test_bit(In_sync, &rdev->flags) || 3560 recovery_start == MaxSector) 3561 return sprintf(page, "none\n"); 3562 3563 return sprintf(page, "%llu\n", recovery_start); 3564} 3565 3566static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3567{ 3568 unsigned long long recovery_start; 3569 3570 if (cmd_match(buf, "none")) 3571 recovery_start = MaxSector; 3572 else if (kstrtoull(buf, 10, &recovery_start)) 3573 return -EINVAL; 3574 3575 if (rdev->mddev->pers && 3576 rdev->raid_disk >= 0) 3577 return -EBUSY; 3578 3579 rdev->recovery_offset = recovery_start; 3580 if (recovery_start == MaxSector) 3581 set_bit(In_sync, &rdev->flags); 3582 else 3583 clear_bit(In_sync, &rdev->flags); 3584 return len; 3585} 3586 3587static struct rdev_sysfs_entry rdev_recovery_start = 3588__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3589 3590/* sysfs access to bad-blocks list. 3591 * We present two files. 3592 * 'bad-blocks' lists sector numbers and lengths of ranges that 3593 * are recorded as bad. The list is truncated to fit within 3594 * the one-page limit of sysfs. 3595 * Writing "sector length" to this file adds an acknowledged 3596 * bad block list. 3597 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3598 * been acknowledged. Writing to this file adds bad blocks 3599 * without acknowledging them. This is largely for testing. 
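 * For example, writing "104860 16" to 'bad_blocks' records a 16-sector
 * range starting at sector 104860 as an acknowledged bad range.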
3600 */ 3601static ssize_t bb_show(struct md_rdev *rdev, char *page) 3602{ 3603 return badblocks_show(&rdev->badblocks, page, 0); 3604} 3605static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3606{ 3607 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3608 /* Maybe that ack was all we needed */ 3609 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3610 wake_up(&rdev->blocked_wait); 3611 return rv; 3612} 3613static struct rdev_sysfs_entry rdev_bad_blocks = 3614__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3615 3616static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3617{ 3618 return badblocks_show(&rdev->badblocks, page, 1); 3619} 3620static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3621{ 3622 return badblocks_store(&rdev->badblocks, page, len, 1); 3623} 3624static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3625__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3626 3627static ssize_t 3628ppl_sector_show(struct md_rdev *rdev, char *page) 3629{ 3630 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3631} 3632 3633static ssize_t 3634ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3635{ 3636 unsigned long long sector; 3637 3638 if (kstrtoull(buf, 10, &sector) < 0) 3639 return -EINVAL; 3640 if (sector != (sector_t)sector) 3641 return -EINVAL; 3642 3643 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3644 rdev->raid_disk >= 0) 3645 return -EBUSY; 3646 3647 if (rdev->mddev->persistent) { 3648 if (rdev->mddev->major_version == 0) 3649 return -EINVAL; 3650 if ((sector > rdev->sb_start && 3651 sector - rdev->sb_start > S16_MAX) || 3652 (sector < rdev->sb_start && 3653 rdev->sb_start - sector > -S16_MIN)) 3654 return -EINVAL; 3655 rdev->ppl.offset = sector - rdev->sb_start; 3656 } else if (!rdev->mddev->external) { 3657 return -EBUSY; 3658 } 3659 rdev->ppl.sector = sector; 3660 return len; 3661} 3662 3663static struct rdev_sysfs_entry rdev_ppl_sector = 3664__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3665 3666static ssize_t 3667ppl_size_show(struct md_rdev *rdev, char *page) 3668{ 3669 return sprintf(page, "%u\n", rdev->ppl.size); 3670} 3671 3672static ssize_t 3673ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3674{ 3675 unsigned int size; 3676 3677 if (kstrtouint(buf, 10, &size) < 0) 3678 return -EINVAL; 3679 3680 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3681 rdev->raid_disk >= 0) 3682 return -EBUSY; 3683 3684 if (rdev->mddev->persistent) { 3685 if (rdev->mddev->major_version == 0) 3686 return -EINVAL; 3687 if (size > U16_MAX) 3688 return -EINVAL; 3689 } else if (!rdev->mddev->external) { 3690 return -EBUSY; 3691 } 3692 rdev->ppl.size = size; 3693 return len; 3694} 3695 3696static struct rdev_sysfs_entry rdev_ppl_size = 3697__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3698 3699static struct attribute *rdev_default_attrs[] = { 3700 &rdev_state.attr, 3701 &rdev_errors.attr, 3702 &rdev_slot.attr, 3703 &rdev_offset.attr, 3704 &rdev_new_offset.attr, 3705 &rdev_size.attr, 3706 &rdev_recovery_start.attr, 3707 &rdev_bad_blocks.attr, 3708 &rdev_unack_bad_blocks.attr, 3709 &rdev_ppl_sector.attr, 3710 &rdev_ppl_size.attr, 3711 NULL, 3712}; 3713ATTRIBUTE_GROUPS(rdev_default); 3714static ssize_t 3715rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3716{ 3717 struct rdev_sysfs_entry *entry = container_of(attr, struct 
rdev_sysfs_entry, attr); 3718 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3719 3720 if (!entry->show) 3721 return -EIO; 3722 if (!rdev->mddev) 3723 return -ENODEV; 3724 return entry->show(rdev, page); 3725} 3726 3727static ssize_t 3728rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3729 const char *page, size_t length) 3730{ 3731 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3732 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3733 struct kernfs_node *kn = NULL; 3734 bool suspend = false; 3735 ssize_t rv; 3736 struct mddev *mddev = READ_ONCE(rdev->mddev); 3737 3738 if (!entry->store) 3739 return -EIO; 3740 if (!capable(CAP_SYS_ADMIN)) 3741 return -EACCES; 3742 if (!mddev) 3743 return -ENODEV; 3744 3745 if (entry->store == state_store) { 3746 if (cmd_match(page, "remove")) 3747 kn = sysfs_break_active_protection(kobj, attr); 3748 if (cmd_match(page, "remove") || cmd_match(page, "re-add") || 3749 cmd_match(page, "writemostly") || 3750 cmd_match(page, "-writemostly")) 3751 suspend = true; 3752 } 3753 3754 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); 3755 if (!rv) { 3756 if (rdev->mddev == NULL) 3757 rv = -ENODEV; 3758 else 3759 rv = entry->store(rdev, page, length); 3760 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 3761 } 3762 3763 if (kn) 3764 sysfs_unbreak_active_protection(kn); 3765 3766 return rv; 3767} 3768 3769static void rdev_free(struct kobject *ko) 3770{ 3771 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3772 kfree(rdev); 3773} 3774static const struct sysfs_ops rdev_sysfs_ops = { 3775 .show = rdev_attr_show, 3776 .store = rdev_attr_store, 3777}; 3778static const struct kobj_type rdev_ktype = { 3779 .release = rdev_free, 3780 .sysfs_ops = &rdev_sysfs_ops, 3781 .default_groups = rdev_default_groups, 3782}; 3783 3784int md_rdev_init(struct md_rdev *rdev) 3785{ 3786 rdev->desc_nr = -1; 3787 rdev->saved_raid_disk = -1; 3788 rdev->raid_disk = -1; 3789 rdev->flags = 0; 3790 rdev->data_offset = 0; 3791 rdev->new_data_offset = 0; 3792 rdev->sb_events = 0; 3793 rdev->last_read_error = 0; 3794 rdev->sb_loaded = 0; 3795 rdev->bb_page = NULL; 3796 atomic_set(&rdev->nr_pending, 0); 3797 atomic_set(&rdev->read_errors, 0); 3798 atomic_set(&rdev->corrected_errors, 0); 3799 3800 INIT_LIST_HEAD(&rdev->same_set); 3801 init_waitqueue_head(&rdev->blocked_wait); 3802 3803 /* Add space to store bad block list. 3804 * This reserves the space even on arrays where it cannot 3805 * be used - I wonder if that matters 3806 */ 3807 return badblocks_init(&rdev->badblocks, 0); 3808} 3809EXPORT_SYMBOL_GPL(md_rdev_init); 3810 3811/* 3812 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3813 * 3814 * mark the device faulty if: 3815 * 3816 * - the device is nonexistent (zero size) 3817 * - the device has no valid superblock 3818 * 3819 * a faulty rdev _never_ has rdev->sb set. 3820 */ 3821static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3822{ 3823 struct md_rdev *rdev; 3824 sector_t size; 3825 int err; 3826 3827 rdev = kzalloc_obj(*rdev); 3828 if (!rdev) 3829 return ERR_PTR(-ENOMEM); 3830 3831 err = md_rdev_init(rdev); 3832 if (err) 3833 goto out_free_rdev; 3834 err = alloc_disk_sb(rdev); 3835 if (err) 3836 goto out_clear_rdev; 3837 3838 rdev->bdev_file = bdev_file_open_by_dev(newdev, 3839 BLK_OPEN_READ | BLK_OPEN_WRITE, 3840 super_format == -2 ? 
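			/*
			 * A super_format of -2 claims the bdev with the shared
			 * claim_rdev holder declared above; any negative
			 * super_format also skips the load_super() validation
			 * further down.
			 */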
&claim_rdev : rdev, NULL); 3841 if (IS_ERR(rdev->bdev_file)) { 3842 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3843 MAJOR(newdev), MINOR(newdev)); 3844 err = PTR_ERR(rdev->bdev_file); 3845 goto out_clear_rdev; 3846 } 3847 rdev->bdev = file_bdev(rdev->bdev_file); 3848 3849 kobject_init(&rdev->kobj, &rdev_ktype); 3850 3851 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3852 if (!size) { 3853 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3854 rdev->bdev); 3855 err = -EINVAL; 3856 goto out_blkdev_put; 3857 } 3858 3859 if (super_format >= 0) { 3860 err = super_types[super_format]. 3861 load_super(rdev, NULL, super_minor); 3862 if (err == -EINVAL) { 3863 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3864 rdev->bdev, 3865 super_format, super_minor); 3866 goto out_blkdev_put; 3867 } 3868 if (err < 0) { 3869 pr_warn("md: could not read %pg's sb, not importing!\n", 3870 rdev->bdev); 3871 goto out_blkdev_put; 3872 } 3873 } 3874 3875 return rdev; 3876 3877out_blkdev_put: 3878 fput(rdev->bdev_file); 3879out_clear_rdev: 3880 md_rdev_clear(rdev); 3881out_free_rdev: 3882 kfree(rdev); 3883 return ERR_PTR(err); 3884} 3885 3886/* 3887 * Check a full RAID array for plausibility 3888 */ 3889 3890static int analyze_sbs(struct mddev *mddev) 3891{ 3892 struct md_rdev *rdev, *freshest, *tmp; 3893 3894 freshest = NULL; 3895 rdev_for_each_safe(rdev, tmp, mddev) 3896 switch (super_types[mddev->major_version]. 3897 load_super(rdev, freshest, mddev->minor_version)) { 3898 case 1: 3899 freshest = rdev; 3900 break; 3901 case 0: 3902 break; 3903 default: 3904 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3905 rdev->bdev); 3906 md_kick_rdev_from_array(rdev); 3907 } 3908 3909 /* Cannot find a valid fresh disk */ 3910 if (!freshest) { 3911 pr_warn("md: cannot find a valid disk\n"); 3912 return -EINVAL; 3913 } 3914 3915 super_types[mddev->major_version]. 3916 validate_super(mddev, NULL/*freshest*/, freshest); 3917 3918 rdev_for_each_safe(rdev, tmp, mddev) { 3919 if (mddev->max_disks && 3920 rdev->desc_nr >= mddev->max_disks) { 3921 pr_warn("md: %s: %pg: only %d devices permitted\n", 3922 mdname(mddev), rdev->bdev, 3923 mddev->max_disks); 3924 md_kick_rdev_from_array(rdev); 3925 continue; 3926 } 3927 if (rdev != freshest) { 3928 if (super_types[mddev->major_version]. 3929 validate_super(mddev, freshest, rdev)) { 3930 pr_warn("md: kicking non-fresh %pg from array!\n", 3931 rdev->bdev); 3932 md_kick_rdev_from_array(rdev); 3933 continue; 3934 } 3935 } 3936 if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && 3937 !test_bit(Journal, &rdev->flags)) { 3938 rdev->raid_disk = -1; 3939 clear_bit(In_sync, &rdev->flags); 3940 } 3941 } 3942 3943 return 0; 3944} 3945 3946/* Read a fixed-point number. 3947 * Numbers in sysfs attributes should be in "standard" units where 3948 * possible, so time should be in seconds. 3949 * However, we internally use a much smaller unit such as 3950 * milliseconds or jiffies. 3951 * This function takes a decimal number with a possible fractional 3952 * component, and produces an integer which is the result of 3953 * multiplying that number by 10^'scale', 3954 * all without any floating-point arithmetic. 3955 */ 3956int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3957{ 3958 unsigned long result = 0; 3959 long decimals = -1; 3960 while (isdigit(*cp) || (*cp == '.'
&& decimals < 0)) { 3961 if (*cp == '.') 3962 decimals = 0; 3963 else if (decimals < scale) { 3964 unsigned int value; 3965 value = *cp - '0'; 3966 result = result * 10 + value; 3967 if (decimals >= 0) 3968 decimals++; 3969 } 3970 cp++; 3971 } 3972 if (*cp == '\n') 3973 cp++; 3974 if (*cp) 3975 return -EINVAL; 3976 if (decimals < 0) 3977 decimals = 0; 3978 *res = result * int_pow(10, scale - decimals); 3979 return 0; 3980} 3981 3982static ssize_t 3983safe_delay_show(struct mddev *mddev, char *page) 3984{ 3985 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; 3986 3987 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); 3988} 3989static ssize_t 3990safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3991{ 3992 unsigned long msec; 3993 3994 if (mddev_is_clustered(mddev)) { 3995 pr_warn("md: Safemode is disabled for clustered mode\n"); 3996 return -EINVAL; 3997 } 3998 3999 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) 4000 return -EINVAL; 4001 if (msec == 0) 4002 mddev->safemode_delay = 0; 4003 else { 4004 unsigned long old_delay = mddev->safemode_delay; 4005 unsigned long new_delay = (msec*HZ)/1000; 4006 4007 if (new_delay == 0) 4008 new_delay = 1; 4009 mddev->safemode_delay = new_delay; 4010 if (new_delay < old_delay || old_delay == 0) 4011 mod_timer(&mddev->safemode_timer, jiffies+1); 4012 } 4013 return len; 4014} 4015static struct md_sysfs_entry md_safe_delay = 4016__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 4017 4018static ssize_t 4019level_show(struct mddev *mddev, char *page) 4020{ 4021 struct md_personality *p; 4022 int ret; 4023 spin_lock(&mddev->lock); 4024 p = mddev->pers; 4025 if (p) 4026 ret = sprintf(page, "%s\n", p->head.name); 4027 else if (mddev->clevel[0]) 4028 ret = sprintf(page, "%s\n", mddev->clevel); 4029 else if (mddev->level != LEVEL_NONE) 4030 ret = sprintf(page, "%d\n", mddev->level); 4031 else 4032 ret = 0; 4033 spin_unlock(&mddev->lock); 4034 return ret; 4035} 4036 4037static ssize_t 4038level_store(struct mddev *mddev, const char *buf, size_t len) 4039{ 4040 char clevel[16]; 4041 ssize_t rv; 4042 size_t slen = len; 4043 struct md_personality *pers, *oldpers; 4044 long level; 4045 void *priv, *oldpriv; 4046 struct md_rdev *rdev; 4047 4048 if (slen == 0 || slen >= sizeof(clevel)) 4049 return -EINVAL; 4050 4051 rv = mddev_suspend_and_lock(mddev); 4052 if (rv) 4053 return rv; 4054 4055 if (mddev->pers == NULL) { 4056 memcpy(mddev->clevel, buf, slen); 4057 if (mddev->clevel[slen-1] == '\n') 4058 slen--; 4059 mddev->clevel[slen] = 0; 4060 mddev->level = LEVEL_NONE; 4061 rv = len; 4062 goto out_unlock; 4063 } 4064 rv = -EROFS; 4065 if (!md_is_rdwr(mddev)) 4066 goto out_unlock; 4067 4068 /* request to change the personality. Need to ensure: 4069 * - array is not engaged in resync/recovery/reshape 4070 * - old personality can be suspended 4071 * - new personality will access other array. 
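 * (Illustrative example with a hypothetical device name: echo raid5 > /sys/block/md0/md/level on a compatible array such as raid0 loads the md-raid5 module on demand and invokes the raid5 personality's ->takeover() to convert the array in place, as the code below shows.)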
4072 */ 4073 4074 rv = -EBUSY; 4075 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4076 mddev->reshape_position != MaxSector || 4077 mddev->sysfs_active) 4078 goto out_unlock; 4079 4080 rv = -EINVAL; 4081 if (!mddev->pers->quiesce) { 4082 pr_warn("md: %s: %s does not support online personality change\n", 4083 mdname(mddev), mddev->pers->head.name); 4084 goto out_unlock; 4085 } 4086 4087 /* Now find the new personality */ 4088 memcpy(clevel, buf, slen); 4089 if (clevel[slen-1] == '\n') 4090 slen--; 4091 clevel[slen] = 0; 4092 if (kstrtol(clevel, 10, &level)) 4093 level = LEVEL_NONE; 4094 4095 if (request_module("md-%s", clevel) != 0) 4096 request_module("md-level-%s", clevel); 4097 pers = get_pers(level, clevel); 4098 if (!pers) { 4099 rv = -EINVAL; 4100 goto out_unlock; 4101 } 4102 4103 if (pers == mddev->pers) { 4104 /* Nothing to do! */ 4105 put_pers(pers); 4106 rv = len; 4107 goto out_unlock; 4108 } 4109 if (!pers->takeover) { 4110 put_pers(pers); 4111 pr_warn("md: %s: %s does not support personality takeover\n", 4112 mdname(mddev), clevel); 4113 rv = -EINVAL; 4114 goto out_unlock; 4115 } 4116 4117 rdev_for_each(rdev, mddev) 4118 rdev->new_raid_disk = rdev->raid_disk; 4119 4120 /* ->takeover must set new_* and/or delta_disks 4121 * if it succeeds, and may set them when it fails. 4122 */ 4123 priv = pers->takeover(mddev); 4124 if (IS_ERR(priv)) { 4125 mddev->new_level = mddev->level; 4126 mddev->new_layout = mddev->layout; 4127 mddev->new_chunk_sectors = mddev->chunk_sectors; 4128 mddev->raid_disks -= mddev->delta_disks; 4129 mddev->delta_disks = 0; 4130 mddev->reshape_backwards = 0; 4131 put_pers(pers); 4132 pr_warn("md: %s: %s would not accept array\n", 4133 mdname(mddev), clevel); 4134 rv = PTR_ERR(priv); 4135 goto out_unlock; 4136 } 4137 4138 /* Looks like we have a winner */ 4139 mddev_detach(mddev); 4140 4141 spin_lock(&mddev->lock); 4142 oldpers = mddev->pers; 4143 oldpriv = mddev->private; 4144 mddev->pers = pers; 4145 mddev->private = priv; 4146 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 4147 mddev->level = mddev->new_level; 4148 mddev->layout = mddev->new_layout; 4149 mddev->chunk_sectors = mddev->new_chunk_sectors; 4150 mddev->delta_disks = 0; 4151 mddev->reshape_backwards = 0; 4152 mddev->degraded = 0; 4153 spin_unlock(&mddev->lock); 4154 4155 if (oldpers->sync_request == NULL && 4156 mddev->external) { 4157 /* We are converting from a no-redundancy array 4158 * to a redundancy array and metadata is managed 4159 * externally so we need to be sure that writes 4160 * won't block due to a need to transition 4161 * clean->dirty 4162 * until external management is started. 
4163 */ 4164 mddev->in_sync = 0; 4165 mddev->safemode_delay = 0; 4166 mddev->safemode = 0; 4167 } 4168 4169 oldpers->free(mddev, oldpriv); 4170 4171 if (oldpers->sync_request == NULL && 4172 pers->sync_request != NULL) { 4173 /* need to add the md_redundancy_group */ 4174 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4175 pr_warn("md: cannot register extra attributes for %s\n", 4176 mdname(mddev)); 4177 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4178 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4179 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4180 } 4181 if (oldpers->sync_request != NULL && 4182 pers->sync_request == NULL) { 4183 /* need to remove the md_redundancy_group */ 4184 if (mddev->to_remove == NULL) 4185 mddev->to_remove = &md_redundancy_group; 4186 } 4187 4188 put_pers(oldpers); 4189 4190 rdev_for_each(rdev, mddev) { 4191 if (rdev->raid_disk < 0) 4192 continue; 4193 if (rdev->new_raid_disk >= mddev->raid_disks) 4194 rdev->new_raid_disk = -1; 4195 if (rdev->new_raid_disk == rdev->raid_disk) 4196 continue; 4197 sysfs_unlink_rdev(mddev, rdev); 4198 } 4199 rdev_for_each(rdev, mddev) { 4200 if (rdev->raid_disk < 0) 4201 continue; 4202 if (rdev->new_raid_disk == rdev->raid_disk) 4203 continue; 4204 rdev->raid_disk = rdev->new_raid_disk; 4205 if (rdev->raid_disk < 0) 4206 clear_bit(In_sync, &rdev->flags); 4207 else { 4208 if (sysfs_link_rdev(mddev, rdev)) 4209 pr_warn("md: cannot register rd%d for %s after level change\n", 4210 rdev->raid_disk, mdname(mddev)); 4211 } 4212 } 4213 4214 if (pers->sync_request == NULL) { 4215 /* this is now an array without redundancy, so 4216 * it must always be in_sync 4217 */ 4218 mddev->in_sync = 1; 4219 timer_delete_sync(&mddev->safemode_timer); 4220 } 4221 pers->run(mddev); 4222 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4223 if (!mddev->thread) 4224 md_update_sb(mddev, 1); 4225 sysfs_notify_dirent_safe(mddev->sysfs_level); 4226 md_new_event(); 4227 rv = len; 4228out_unlock: 4229 mddev_unlock_and_resume(mddev); 4230 return rv; 4231} 4232 4233static struct md_sysfs_entry md_level = 4234__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4235 4236static ssize_t 4237new_level_show(struct mddev *mddev, char *page) 4238{ 4239 return sprintf(page, "%d\n", mddev->new_level); 4240} 4241 4242static ssize_t 4243new_level_store(struct mddev *mddev, const char *buf, size_t len) 4244{ 4245 unsigned int n; 4246 int err; 4247 4248 err = kstrtouint(buf, 10, &n); 4249 if (err < 0) 4250 return err; 4251 err = mddev_lock(mddev); 4252 if (err) 4253 return err; 4254 4255 mddev->new_level = n; 4256 md_update_sb(mddev, 1); 4257 4258 mddev_unlock(mddev); 4259 return len; 4260} 4261static struct md_sysfs_entry md_new_level = 4262__ATTR(new_level, 0664, new_level_show, new_level_store); 4263 4264static ssize_t 4265bitmap_type_show(struct mddev *mddev, char *page) 4266{ 4267 struct md_submodule_head *head; 4268 unsigned long i; 4269 ssize_t len = 0; 4270 4271 if (mddev->bitmap_id == ID_BITMAP_NONE) 4272 len += sprintf(page + len, "[none] "); 4273 else 4274 len += sprintf(page + len, "none "); 4275 4276 xa_lock(&md_submodule); 4277 xa_for_each(&md_submodule, i, head) { 4278 if (head->type != MD_BITMAP || head->id == ID_BITMAP_NONE) 4279 continue; 4280 4281 if (mddev->bitmap_id == head->id) 4282 len += sprintf(page + len, "[%s] ", head->name); 4283 else 4284 len += sprintf(page + len, "%s ", head->name); 4285 } 4286 xa_unlock(&md_submodule); 4287 4288 len += 
sprintf(page + len, "\n"); 4289 return len; 4290} 4291 4292static ssize_t 4293bitmap_type_store(struct mddev *mddev, const char *buf, size_t len) 4294{ 4295 struct md_submodule_head *head; 4296 enum md_submodule_id id; 4297 unsigned long i; 4298 int err = 0; 4299 4300 xa_lock(&md_submodule); 4301 4302 if (mddev->bitmap_ops) { 4303 err = -EBUSY; 4304 goto out; 4305 } 4306 4307 if (cmd_match(buf, "none")) { 4308 mddev->bitmap_id = ID_BITMAP_NONE; 4309 goto out; 4310 } 4311 4312 xa_for_each(&md_submodule, i, head) { 4313 if (head->type == MD_BITMAP && cmd_match(buf, head->name)) { 4314 mddev->bitmap_id = head->id; 4315 goto out; 4316 } 4317 } 4318 4319 err = kstrtoint(buf, 10, &id); 4320 if (err) 4321 goto out; 4322 4323 if (id == ID_BITMAP_NONE) { 4324 mddev->bitmap_id = id; 4325 goto out; 4326 } 4327 4328 head = xa_load(&md_submodule, id); 4329 if (head && head->type == MD_BITMAP) { 4330 mddev->bitmap_id = id; 4331 goto out; 4332 } 4333 4334 err = -ENOENT; 4335 4336out: 4337 xa_unlock(&md_submodule); 4338 return err ? err : len; 4339} 4340 4341static struct md_sysfs_entry md_bitmap_type = 4342__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store); 4343 4344static ssize_t 4345layout_show(struct mddev *mddev, char *page) 4346{ 4347 /* just a number, not meaningful for all levels */ 4348 if (mddev->reshape_position != MaxSector && 4349 mddev->layout != mddev->new_layout) 4350 return sprintf(page, "%d (%d)\n", 4351 mddev->new_layout, mddev->layout); 4352 return sprintf(page, "%d\n", mddev->layout); 4353} 4354 4355static ssize_t 4356layout_store(struct mddev *mddev, const char *buf, size_t len) 4357{ 4358 unsigned int n; 4359 int err; 4360 4361 err = kstrtouint(buf, 10, &n); 4362 if (err < 0) 4363 return err; 4364 err = mddev_lock(mddev); 4365 if (err) 4366 return err; 4367 4368 if (mddev->pers) { 4369 if (mddev->pers->check_reshape == NULL) 4370 err = -EBUSY; 4371 else if (!md_is_rdwr(mddev)) 4372 err = -EROFS; 4373 else { 4374 mddev->new_layout = n; 4375 err = mddev->pers->check_reshape(mddev); 4376 if (err) 4377 mddev->new_layout = mddev->layout; 4378 } 4379 } else { 4380 mddev->new_layout = n; 4381 if (mddev->reshape_position == MaxSector) 4382 mddev->layout = n; 4383 } 4384 mddev_unlock(mddev); 4385 return err ?: len; 4386} 4387static struct md_sysfs_entry md_layout = 4388__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4389 4390static ssize_t 4391raid_disks_show(struct mddev *mddev, char *page) 4392{ 4393 if (mddev->raid_disks == 0) 4394 return 0; 4395 if (mddev->reshape_position != MaxSector && 4396 mddev->delta_disks != 0) 4397 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4398 mddev->raid_disks - mddev->delta_disks); 4399 return sprintf(page, "%d\n", mddev->raid_disks); 4400} 4401 4402static int update_raid_disks(struct mddev *mddev, int raid_disks); 4403 4404static ssize_t 4405raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4406{ 4407 unsigned int n; 4408 int err; 4409 4410 err = kstrtouint(buf, 10, &n); 4411 if (err < 0) 4412 return err; 4413 4414 err = mddev_suspend_and_lock(mddev); 4415 if (err) 4416 return err; 4417 if (mddev->pers) 4418 err = update_raid_disks(mddev, n); 4419 else if (mddev->reshape_position != MaxSector) { 4420 struct md_rdev *rdev; 4421 int olddisks = mddev->raid_disks - mddev->delta_disks; 4422 4423 err = -EINVAL; 4424 rdev_for_each(rdev, mddev) { 4425 if (olddisks < n && 4426 rdev->data_offset < rdev->new_data_offset) 4427 goto out_unlock; 4428 if (olddisks > n && 4429 rdev->data_offset > rdev->new_data_offset) 4430 
goto out_unlock; 4431 } 4432 err = 0; 4433 mddev->delta_disks = n - olddisks; 4434 mddev->raid_disks = n; 4435 mddev->reshape_backwards = (mddev->delta_disks < 0); 4436 } else 4437 mddev->raid_disks = n; 4438out_unlock: 4439 mddev_unlock_and_resume(mddev); 4440 return err ? err : len; 4441} 4442static struct md_sysfs_entry md_raid_disks = 4443__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4444 4445static ssize_t 4446uuid_show(struct mddev *mddev, char *page) 4447{ 4448 return sprintf(page, "%pU\n", mddev->uuid); 4449} 4450static struct md_sysfs_entry md_uuid = 4451__ATTR(uuid, S_IRUGO, uuid_show, NULL); 4452 4453static ssize_t 4454chunk_size_show(struct mddev *mddev, char *page) 4455{ 4456 if (mddev->reshape_position != MaxSector && 4457 mddev->chunk_sectors != mddev->new_chunk_sectors) 4458 return sprintf(page, "%d (%d)\n", 4459 mddev->new_chunk_sectors << 9, 4460 mddev->chunk_sectors << 9); 4461 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4462} 4463 4464static ssize_t 4465chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4466{ 4467 unsigned long n; 4468 int err; 4469 4470 err = kstrtoul(buf, 10, &n); 4471 if (err < 0) 4472 return err; 4473 4474 err = mddev_lock(mddev); 4475 if (err) 4476 return err; 4477 if (mddev->pers) { 4478 if (mddev->pers->check_reshape == NULL) 4479 err = -EBUSY; 4480 else if (!md_is_rdwr(mddev)) 4481 err = -EROFS; 4482 else { 4483 mddev->new_chunk_sectors = n >> 9; 4484 err = mddev->pers->check_reshape(mddev); 4485 if (err) 4486 mddev->new_chunk_sectors = mddev->chunk_sectors; 4487 } 4488 } else { 4489 mddev->new_chunk_sectors = n >> 9; 4490 if (mddev->reshape_position == MaxSector) 4491 mddev->chunk_sectors = n >> 9; 4492 } 4493 mddev_unlock(mddev); 4494 return err ?: len; 4495} 4496static struct md_sysfs_entry md_chunk_size = 4497__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4498 4499static ssize_t 4500resync_start_show(struct mddev *mddev, char *page) 4501{ 4502 if (mddev->resync_offset == MaxSector) 4503 return sprintf(page, "none\n"); 4504 return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); 4505} 4506 4507static ssize_t 4508resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4509{ 4510 unsigned long long n; 4511 int err; 4512 4513 if (cmd_match(buf, "none")) 4514 n = MaxSector; 4515 else { 4516 err = kstrtoull(buf, 10, &n); 4517 if (err < 0) 4518 return err; 4519 if (n != (sector_t)n) 4520 return -EINVAL; 4521 } 4522 4523 err = mddev_lock(mddev); 4524 if (err) 4525 return err; 4526 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4527 err = -EBUSY; 4528 4529 if (!err) { 4530 mddev->resync_offset = n; 4531 if (mddev->pers) 4532 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4533 } 4534 mddev_unlock(mddev); 4535 return err ?: len; 4536} 4537static struct md_sysfs_entry md_resync_start = 4538__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4539 resync_start_show, resync_start_store); 4540 4541/* 4542 * The array state can be: 4543 * 4544 * clear 4545 * No devices, no size, no level 4546 * Equivalent to STOP_ARRAY ioctl 4547 * inactive 4548 * May have some settings, but array is not active 4549 * all IO results in error 4550 * When written, doesn't tear down array, but just stops it 4551 * suspended (not supported yet) 4552 * All IO requests will block. The array can be reconfigured. 4553 * Writing this, if accepted, will block until array is quiescent 4554 * readonly 4555 * no resync can happen. no superblocks get written. 
4556 * write requests fail 4557 * read-auto 4558 * like readonly, but behaves like 'clean' on a write request. 4559 * 4560 * clean - no pending writes, but otherwise active. 4561 * When written to inactive array, starts without resync 4562 * If a write request arrives then 4563 * if metadata is known, mark 'dirty' and switch to 'active'. 4564 * if not known, block and switch to write-pending 4565 * If written to an active array that has pending writes, then fails. 4566 * active 4567 * fully active: IO and resync can be happening. 4568 * When written to inactive array, starts with resync 4569 * 4570 * write-pending 4571 * clean, but writes are blocked waiting for 'active' to be written. 4572 * 4573 * active-idle 4574 * like active, but no writes have been seen for a while (100msec). 4575 * 4576 * broken 4577* Array is failed. It's useful because mounted-arrays aren't stopped 4578* when array is failed, so this state will at least alert the user that 4579* something is wrong. 4580 */ 4581enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4582 write_pending, active_idle, broken, bad_word}; 4583static char *array_states[] = { 4584 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4585 "write-pending", "active-idle", "broken", NULL }; 4586 4587static int match_word(const char *word, char **list) 4588{ 4589 int n; 4590 for (n=0; list[n]; n++) 4591 if (cmd_match(word, list[n])) 4592 break; 4593 return n; 4594} 4595 4596static ssize_t 4597array_state_show(struct mddev *mddev, char *page) 4598{ 4599 enum array_state st = inactive; 4600 4601 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4602 switch(mddev->ro) { 4603 case MD_RDONLY: 4604 st = readonly; 4605 break; 4606 case MD_AUTO_READ: 4607 st = read_auto; 4608 break; 4609 case MD_RDWR: 4610 spin_lock(&mddev->lock); 4611 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4612 st = write_pending; 4613 else if (mddev->in_sync) 4614 st = clean; 4615 else if (mddev->safemode) 4616 st = active_idle; 4617 else 4618 st = active; 4619 spin_unlock(&mddev->lock); 4620 } 4621 4622 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4623 st = broken; 4624 } else { 4625 if (list_empty(&mddev->disks) && 4626 mddev->raid_disks == 0 && 4627 mddev->dev_sectors == 0) 4628 st = clear; 4629 else 4630 st = inactive; 4631 } 4632 return sprintf(page, "%s\n", array_states[st]); 4633} 4634 4635static int do_md_stop(struct mddev *mddev, int ro); 4636static int md_set_readonly(struct mddev *mddev); 4637static int restart_array(struct mddev *mddev); 4638 4639static ssize_t 4640array_state_store(struct mddev *mddev, const char *buf, size_t len) 4641{ 4642 int err = 0; 4643 enum array_state st = match_word(buf, array_states); 4644 4645 /* No lock dependent actions */ 4646 switch (st) { 4647 case suspended: /* not supported yet */ 4648 case write_pending: /* cannot be set */ 4649 case active_idle: /* cannot be set */ 4650 case broken: /* cannot be set */ 4651 case bad_word: 4652 return -EINVAL; 4653 case clear: 4654 case readonly: 4655 case inactive: 4656 case read_auto: 4657 if (!mddev->pers || !md_is_rdwr(mddev)) 4658 break; 4659 /* write sysfs will not open mddev and opener should be 0 */ 4660 err = mddev_set_closing_and_sync_blockdev(mddev, 0); 4661 if (err) 4662 return err; 4663 break; 4664 default: 4665 break; 4666 } 4667 4668 if (mddev->pers && (st == active || st == clean) && 4669 mddev->ro != MD_RDONLY) { 4670 /* don't take reconfig_mutex when toggling between 4671 * clean and active 4672 
*/ 4673 spin_lock(&mddev->lock); 4674 if (st == active) { 4675 restart_array(mddev); 4676 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4677 md_wakeup_thread(mddev->thread); 4678 wake_up(&mddev->sb_wait); 4679 } else /* st == clean */ { 4680 restart_array(mddev); 4681 if (!set_in_sync(mddev)) 4682 err = -EBUSY; 4683 } 4684 if (!err) 4685 sysfs_notify_dirent_safe(mddev->sysfs_state); 4686 spin_unlock(&mddev->lock); 4687 return err ?: len; 4688 } 4689 err = mddev_lock(mddev); 4690 if (err) 4691 return err; 4692 4693 switch (st) { 4694 case inactive: 4695 /* stop an active array, return 0 otherwise */ 4696 if (mddev->pers) 4697 err = do_md_stop(mddev, 2); 4698 break; 4699 case clear: 4700 err = do_md_stop(mddev, 0); 4701 break; 4702 case readonly: 4703 if (mddev->pers) 4704 err = md_set_readonly(mddev); 4705 else { 4706 mddev->ro = MD_RDONLY; 4707 set_disk_ro(mddev->gendisk, 1); 4708 err = do_md_run(mddev); 4709 } 4710 break; 4711 case read_auto: 4712 if (mddev->pers) { 4713 if (md_is_rdwr(mddev)) 4714 err = md_set_readonly(mddev); 4715 else if (mddev->ro == MD_RDONLY) 4716 err = restart_array(mddev); 4717 if (err == 0) { 4718 mddev->ro = MD_AUTO_READ; 4719 set_disk_ro(mddev->gendisk, 0); 4720 } 4721 } else { 4722 mddev->ro = MD_AUTO_READ; 4723 err = do_md_run(mddev); 4724 } 4725 break; 4726 case clean: 4727 if (mddev->pers) { 4728 err = restart_array(mddev); 4729 if (err) 4730 break; 4731 spin_lock(&mddev->lock); 4732 if (!set_in_sync(mddev)) 4733 err = -EBUSY; 4734 spin_unlock(&mddev->lock); 4735 } else 4736 err = -EINVAL; 4737 break; 4738 case active: 4739 if (mddev->pers) { 4740 err = restart_array(mddev); 4741 if (err) 4742 break; 4743 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4744 wake_up(&mddev->sb_wait); 4745 err = 0; 4746 } else { 4747 mddev->ro = MD_RDWR; 4748 set_disk_ro(mddev->gendisk, 0); 4749 err = do_md_run(mddev); 4750 } 4751 break; 4752 default: 4753 err = -EINVAL; 4754 break; 4755 } 4756 4757 if (!err) { 4758 if (mddev->hold_active == UNTIL_IOCTL) 4759 mddev->hold_active = 0; 4760 sysfs_notify_dirent_safe(mddev->sysfs_state); 4761 } 4762 mddev_unlock(mddev); 4763 4764 if (st == readonly || st == read_auto || st == inactive || 4765 (err && st == clear)) 4766 clear_bit(MD_CLOSING, &mddev->flags); 4767 4768 return err ?: len; 4769} 4770static struct md_sysfs_entry md_array_state = 4771__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4772 4773static ssize_t 4774max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4775 return sprintf(page, "%d\n", 4776 atomic_read(&mddev->max_corr_read_errors)); 4777} 4778 4779static ssize_t 4780max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4781{ 4782 unsigned int n; 4783 int rv; 4784 4785 rv = kstrtouint(buf, 10, &n); 4786 if (rv < 0) 4787 return rv; 4788 if (n > INT_MAX) 4789 return -EINVAL; 4790 atomic_set(&mddev->max_corr_read_errors, n); 4791 return len; 4792} 4793 4794static struct md_sysfs_entry max_corr_read_errors = 4795__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4796 max_corrected_read_errors_store); 4797 4798static ssize_t 4799null_show(struct mddev *mddev, char *page) 4800{ 4801 return -EINVAL; 4802} 4803 4804static ssize_t 4805new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4806{ 4807 /* buf must be %d:%d\n? giving major and minor numbers */ 4808 /* The new device is added to the array. 
4809 * If the array has a persistent superblock, we read the 4810 * superblock to initialise info and check validity. 4811 * Otherwise, only checking done is that in bind_rdev_to_array, 4812 * which mainly checks size. 4813 */ 4814 char *e; 4815 int major = simple_strtoul(buf, &e, 10); 4816 int minor; 4817 dev_t dev; 4818 struct md_rdev *rdev; 4819 int err; 4820 4821 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4822 return -EINVAL; 4823 minor = simple_strtoul(e+1, &e, 10); 4824 if (*e && *e != '\n') 4825 return -EINVAL; 4826 dev = MKDEV(major, minor); 4827 if (major != MAJOR(dev) || 4828 minor != MINOR(dev)) 4829 return -EOVERFLOW; 4830 4831 err = mddev_suspend_and_lock(mddev); 4832 if (err) 4833 return err; 4834 if (mddev->persistent) { 4835 rdev = md_import_device(dev, mddev->major_version, 4836 mddev->minor_version); 4837 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4838 struct md_rdev *rdev0 4839 = list_entry(mddev->disks.next, 4840 struct md_rdev, same_set); 4841 err = super_types[mddev->major_version] 4842 .load_super(rdev, rdev0, mddev->minor_version); 4843 if (err < 0) 4844 goto out; 4845 } 4846 } else if (mddev->external) 4847 rdev = md_import_device(dev, -2, -1); 4848 else 4849 rdev = md_import_device(dev, -1, -1); 4850 4851 if (IS_ERR(rdev)) { 4852 mddev_unlock_and_resume(mddev); 4853 return PTR_ERR(rdev); 4854 } 4855 err = bind_rdev_to_array(rdev, mddev); 4856 out: 4857 if (err) 4858 export_rdev(rdev); 4859 mddev_unlock_and_resume(mddev); 4860 if (!err) 4861 md_new_event(); 4862 return err ? err : len; 4863} 4864 4865static struct md_sysfs_entry md_new_device = 4866__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4867 4868static ssize_t 4869bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4870{ 4871 char *end; 4872 unsigned long chunk, end_chunk; 4873 int err; 4874 4875 if (!md_bitmap_enabled(mddev, false)) 4876 return len; 4877 4878 err = mddev_lock(mddev); 4879 if (err) 4880 return err; 4881 if (!mddev->bitmap) 4882 goto out; 4883 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4884 while (*buf) { 4885 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4886 if (buf == end) 4887 break; 4888 4889 if (*end == '-') { /* range */ 4890 buf = end + 1; 4891 end_chunk = simple_strtoul(buf, &end, 0); 4892 if (buf == end) 4893 break; 4894 } 4895 4896 if (*end && !isspace(*end)) 4897 break; 4898 4899 mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); 4900 buf = skip_spaces(end); 4901 } 4902 mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ 4903out: 4904 mddev_unlock(mddev); 4905 return len; 4906} 4907 4908static struct md_sysfs_entry md_bitmap = 4909__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4910 4911static ssize_t 4912size_show(struct mddev *mddev, char *page) 4913{ 4914 return sprintf(page, "%llu\n", 4915 (unsigned long long)mddev->dev_sectors / 2); 4916} 4917 4918static int update_size(struct mddev *mddev, sector_t num_sectors); 4919 4920static ssize_t 4921size_store(struct mddev *mddev, const char *buf, size_t len) 4922{ 4923 /* If array is inactive, we can reduce the component size, but 4924 * not increase it (except from 0). 
4925 * If array is active, we can try an on-line resize 4926 */ 4927 sector_t sectors; 4928 int err = strict_blocks_to_sectors(buf, &sectors); 4929 4930 if (err < 0) 4931 return err; 4932 err = mddev_lock(mddev); 4933 if (err) 4934 return err; 4935 if (mddev->pers) { 4936 err = update_size(mddev, sectors); 4937 if (err == 0) 4938 md_update_sb(mddev, 1); 4939 } else { 4940 if (mddev->dev_sectors == 0 || 4941 mddev->dev_sectors > sectors) 4942 mddev->dev_sectors = sectors; 4943 else 4944 err = -ENOSPC; 4945 } 4946 mddev_unlock(mddev); 4947 return err ? err : len; 4948} 4949 4950static struct md_sysfs_entry md_size = 4951__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4952 4953/* Metadata version. 4954 * This is one of 4955 * 'none' for arrays with no metadata (good luck...) 4956 * 'external' for arrays with externally managed metadata, 4957 * or N.M for internally known formats 4958 */ 4959static ssize_t 4960metadata_show(struct mddev *mddev, char *page) 4961{ 4962 if (mddev->persistent) 4963 return sprintf(page, "%d.%d\n", 4964 mddev->major_version, mddev->minor_version); 4965 else if (mddev->external) 4966 return sprintf(page, "external:%s\n", mddev->metadata_type); 4967 else 4968 return sprintf(page, "none\n"); 4969} 4970 4971static ssize_t 4972metadata_store(struct mddev *mddev, const char *buf, size_t len) 4973{ 4974 int major, minor; 4975 char *e; 4976 int err; 4977 /* Changing the details of 'external' metadata is 4978 * always permitted. Otherwise there must be 4979 * no devices attached to the array. 4980 */ 4981 4982 err = mddev_lock(mddev); 4983 if (err) 4984 return err; 4985 err = -EBUSY; 4986 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4987 ; 4988 else if (!list_empty(&mddev->disks)) 4989 goto out_unlock; 4990 4991 err = 0; 4992 if (cmd_match(buf, "none")) { 4993 mddev->persistent = 0; 4994 mddev->external = 0; 4995 mddev->major_version = 0; 4996 mddev->minor_version = 90; 4997 goto out_unlock; 4998 } 4999 if (strncmp(buf, "external:", 9) == 0) { 5000 size_t namelen = len-9; 5001 if (namelen >= sizeof(mddev->metadata_type)) 5002 namelen = sizeof(mddev->metadata_type)-1; 5003 memcpy(mddev->metadata_type, buf+9, namelen); 5004 mddev->metadata_type[namelen] = 0; 5005 if (namelen && mddev->metadata_type[namelen-1] == '\n') 5006 mddev->metadata_type[--namelen] = 0; 5007 mddev->persistent = 0; 5008 mddev->external = 1; 5009 mddev->major_version = 0; 5010 mddev->minor_version = 90; 5011 goto out_unlock; 5012 } 5013 major = simple_strtoul(buf, &e, 10); 5014 err = -EINVAL; 5015 if (e==buf || *e != '.') 5016 goto out_unlock; 5017 buf = e+1; 5018 minor = simple_strtoul(buf, &e, 10); 5019 if (e==buf || (*e && *e != '\n') ) 5020 goto out_unlock; 5021 err = -ENOENT; 5022 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 5023 goto out_unlock; 5024 mddev->major_version = major; 5025 mddev->minor_version = minor; 5026 mddev->persistent = 1; 5027 mddev->external = 0; 5028 err = 0; 5029out_unlock: 5030 mddev_unlock(mddev); 5031 return err ?: len; 5032} 5033 5034static struct md_sysfs_entry md_metadata = 5035__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 5036 5037static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors) 5038{ 5039 return rdev->raid_disk >= 0 && 5040 !test_bit(Journal, &rdev->flags) && 5041 !test_bit(Faulty, &rdev->flags) && 5042 !test_bit(In_sync, &rdev->flags) && 5043 rdev->recovery_offset < sectors; 5044} 5045 5046static enum sync_action md_get_active_sync_action(struct mddev 
*mddev) 5047{ 5048 struct md_rdev *rdev; 5049 bool is_recover = false; 5050 5051 if (mddev->resync_offset < MaxSector) 5052 return ACTION_RESYNC; 5053 5054 if (mddev->reshape_position != MaxSector) 5055 return ACTION_RESHAPE; 5056 5057 rcu_read_lock(); 5058 rdev_for_each_rcu(rdev, mddev) { 5059 if (rdev_needs_recovery(rdev, MaxSector)) { 5060 is_recover = true; 5061 break; 5062 } 5063 } 5064 rcu_read_unlock(); 5065 5066 return is_recover ? ACTION_RECOVER : ACTION_IDLE; 5067} 5068 5069enum sync_action md_sync_action(struct mddev *mddev) 5070{ 5071 unsigned long recovery = mddev->recovery; 5072 enum sync_action active_action; 5073 5074 /* 5075 * frozen has the highest priority, means running sync_thread will be 5076 * stopped immediately, and no new sync_thread can start. 5077 */ 5078 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 5079 return ACTION_FROZEN; 5080 5081 /* 5082 * read-only array can't register sync_thread, and it can only 5083 * add/remove spares. 5084 */ 5085 if (!md_is_rdwr(mddev)) 5086 return ACTION_IDLE; 5087 5088 /* 5089 * idle means no sync_thread is running, and no new sync_thread is 5090 * requested. 5091 */ 5092 if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && 5093 !test_bit(MD_RECOVERY_NEEDED, &recovery)) 5094 return ACTION_IDLE; 5095 5096 /* 5097 * Check if any sync operation (resync/recover/reshape) is 5098 * currently active. This ensures that only one sync operation 5099 * can run at a time. Returns the type of active operation, or 5100 * ACTION_IDLE if none are active. 5101 */ 5102 active_action = md_get_active_sync_action(mddev); 5103 if (active_action != ACTION_IDLE) 5104 return active_action; 5105 5106 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 5107 return ACTION_RESHAPE; 5108 5109 if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 5110 return ACTION_RECOVER; 5111 5112 if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 5113 /* 5114 * MD_RECOVERY_CHECK must be paired with 5115 * MD_RECOVERY_REQUESTED. 5116 */ 5117 if (test_bit(MD_RECOVERY_CHECK, &recovery)) 5118 return ACTION_CHECK; 5119 if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) 5120 return ACTION_REPAIR; 5121 return ACTION_RESYNC; 5122 } 5123 5124 /* 5125 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no 5126 * sync_action is specified. 5127 */ 5128 return ACTION_IDLE; 5129} 5130 5131enum sync_action md_sync_action_by_name(const char *page) 5132{ 5133 enum sync_action action; 5134 5135 for (action = 0; action < NR_SYNC_ACTIONS; ++action) { 5136 if (cmd_match(page, action_name[action])) 5137 return action; 5138 } 5139 5140 return NR_SYNC_ACTIONS; 5141} 5142 5143const char *md_sync_action_name(enum sync_action action) 5144{ 5145 return action_name[action]; 5146} 5147 5148static ssize_t 5149action_show(struct mddev *mddev, char *page) 5150{ 5151 enum sync_action action = md_sync_action(mddev); 5152 5153 return sprintf(page, "%s\n", md_sync_action_name(action)); 5154} 5155 5156/** 5157 * stop_sync_thread() - wait for sync_thread to stop if it's running. 5158 * @mddev: the array. 5159 * @locked: if set, reconfig_mutex will still be held after this function 5160 * return; if not set, reconfig_mutex will be released after this 5161 * function return. 
5162 */ 5163static void stop_sync_thread(struct mddev *mddev, bool locked) 5164{ 5165 int sync_seq = atomic_read(&mddev->sync_seq); 5166 5167 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5168 if (!locked) 5169 mddev_unlock(mddev); 5170 return; 5171 } 5172 5173 mddev_unlock(mddev); 5174 5175 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5176 /* 5177 * Thread might be blocked waiting for metadata update which will now 5178 * never happen 5179 */ 5180 md_wakeup_thread_directly(&mddev->sync_thread); 5181 if (work_pending(&mddev->sync_work)) 5182 flush_work(&mddev->sync_work); 5183 5184 wait_event(resync_wait, 5185 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 5186 (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && 5187 sync_seq != atomic_read(&mddev->sync_seq))); 5188 5189 if (locked) 5190 mddev_lock_nointr(mddev); 5191} 5192 5193void md_idle_sync_thread(struct mddev *mddev) 5194{ 5195 lockdep_assert_held(&mddev->reconfig_mutex); 5196 5197 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5198 stop_sync_thread(mddev, true); 5199} 5200EXPORT_SYMBOL_GPL(md_idle_sync_thread); 5201 5202void md_frozen_sync_thread(struct mddev *mddev) 5203{ 5204 lockdep_assert_held(&mddev->reconfig_mutex); 5205 5206 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5207 stop_sync_thread(mddev, true); 5208} 5209EXPORT_SYMBOL_GPL(md_frozen_sync_thread); 5210 5211void md_unfrozen_sync_thread(struct mddev *mddev) 5212{ 5213 lockdep_assert_held(&mddev->reconfig_mutex); 5214 5215 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5216 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5217 md_wakeup_thread(mddev->thread); 5218 sysfs_notify_dirent_safe(mddev->sysfs_action); 5219} 5220EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); 5221 5222static int mddev_start_reshape(struct mddev *mddev) 5223{ 5224 int ret; 5225 5226 if (mddev->pers->start_reshape == NULL) 5227 return -EINVAL; 5228 5229 if (mddev->reshape_position == MaxSector || 5230 mddev->pers->check_reshape == NULL || 5231 mddev->pers->check_reshape(mddev)) { 5232 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5233 ret = mddev->pers->start_reshape(mddev); 5234 if (ret) 5235 return ret; 5236 } else { 5237 /* 5238 * If reshape is still in progress, and md_check_recovery() can 5239 * continue to reshape, don't restart reshape because data can 5240 * be corrupted for raid456. 5241 */ 5242 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5243 } 5244 5245 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 5246 return 0; 5247} 5248 5249static ssize_t 5250action_store(struct mddev *mddev, const char *page, size_t len) 5251{ 5252 int ret; 5253 enum sync_action action; 5254 5255 if (!mddev->pers || !mddev->pers->sync_request) 5256 return -EINVAL; 5257 5258retry: 5259 if (work_busy(&mddev->sync_work)) 5260 flush_work(&mddev->sync_work); 5261 5262 ret = mddev_lock(mddev); 5263 if (ret) 5264 return ret; 5265 5266 if (work_busy(&mddev->sync_work)) { 5267 mddev_unlock(mddev); 5268 goto retry; 5269 } 5270 5271 action = md_sync_action_by_name(page); 5272 5273 /* TODO: mdadm rely on "idle" to start sync_thread. 
*/ 5274 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5275 switch (action) { 5276 case ACTION_FROZEN: 5277 md_frozen_sync_thread(mddev); 5278 ret = len; 5279 goto out; 5280 case ACTION_IDLE: 5281 md_idle_sync_thread(mddev); 5282 break; 5283 case ACTION_RESHAPE: 5284 case ACTION_RECOVER: 5285 case ACTION_CHECK: 5286 case ACTION_REPAIR: 5287 case ACTION_RESYNC: 5288 ret = -EBUSY; 5289 goto out; 5290 default: 5291 ret = -EINVAL; 5292 goto out; 5293 } 5294 } else { 5295 switch (action) { 5296 case ACTION_FROZEN: 5297 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5298 ret = len; 5299 goto out; 5300 case ACTION_RESHAPE: 5301 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5302 ret = mddev_start_reshape(mddev); 5303 if (ret) 5304 goto out; 5305 break; 5306 case ACTION_RECOVER: 5307 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5308 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5309 break; 5310 case ACTION_CHECK: 5311 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5312 fallthrough; 5313 case ACTION_REPAIR: 5314 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 5315 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5316 fallthrough; 5317 case ACTION_RESYNC: 5318 case ACTION_IDLE: 5319 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5320 break; 5321 default: 5322 ret = -EINVAL; 5323 goto out; 5324 } 5325 } 5326 5327 if (mddev->ro == MD_AUTO_READ) { 5328 /* A write to sync_action is enough to justify 5329 * canceling read-auto mode 5330 */ 5331 mddev->ro = MD_RDWR; 5332 md_wakeup_thread(mddev->sync_thread); 5333 } 5334 5335 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5336 md_wakeup_thread(mddev->thread); 5337 sysfs_notify_dirent_safe(mddev->sysfs_action); 5338 ret = len; 5339 5340out: 5341 mddev_unlock(mddev); 5342 return ret; 5343} 5344 5345static struct md_sysfs_entry md_scan_mode = 5346__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 5347 5348static ssize_t 5349last_sync_action_show(struct mddev *mddev, char *page) 5350{ 5351 return sprintf(page, "%s\n", 5352 md_sync_action_name(mddev->last_sync_action)); 5353} 5354 5355static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 5356 5357static ssize_t 5358mismatch_cnt_show(struct mddev *mddev, char *page) 5359{ 5360 return sprintf(page, "%llu\n", 5361 (unsigned long long) 5362 atomic64_read(&mddev->resync_mismatches)); 5363} 5364 5365static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 5366 5367static ssize_t 5368sync_min_show(struct mddev *mddev, char *page) 5369{ 5370 return sprintf(page, "%d (%s)\n", speed_min(mddev), 5371 mddev->sync_speed_min ? "local" : "system"); 5372} 5373 5374static ssize_t 5375sync_min_store(struct mddev *mddev, const char *buf, size_t len) 5376{ 5377 unsigned int min; 5378 int rv; 5379 5380 if (strncmp(buf, "system", 6) == 0) { 5381 min = 0; 5382 } else { 5383 rv = kstrtouint(buf, 10, &min); 5384 if (rv < 0) 5385 return rv; 5386 if (min == 0) 5387 return -EINVAL; 5388 } 5389 mddev->sync_speed_min = min; 5390 return len; 5391} 5392 5393static struct md_sysfs_entry md_sync_min = 5394__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 5395 5396static ssize_t 5397sync_max_show(struct mddev *mddev, char *page) 5398{ 5399 return sprintf(page, "%d (%s)\n", speed_max(mddev), 5400 mddev->sync_speed_max ? 
"local" : "system"); 5401} 5402 5403static ssize_t 5404sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5405{ 5406 unsigned int max; 5407 int rv; 5408 5409 if (strncmp(buf, "system", 6) == 0) { 5410 max = 0; 5411 } else { 5412 rv = kstrtouint(buf, 10, &max); 5413 if (rv < 0) 5414 return rv; 5415 if (max == 0) 5416 return -EINVAL; 5417 } 5418 mddev->sync_speed_max = max; 5419 return len; 5420} 5421 5422static struct md_sysfs_entry md_sync_max = 5423__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5424 5425static ssize_t 5426sync_io_depth_show(struct mddev *mddev, char *page) 5427{ 5428 return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), 5429 mddev->sync_io_depth ? "local" : "system"); 5430} 5431 5432static ssize_t 5433sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) 5434{ 5435 unsigned int max; 5436 int rv; 5437 5438 if (strncmp(buf, "system", 6) == 0) { 5439 max = 0; 5440 } else { 5441 rv = kstrtouint(buf, 10, &max); 5442 if (rv < 0) 5443 return rv; 5444 if (max == 0) 5445 return -EINVAL; 5446 } 5447 mddev->sync_io_depth = max; 5448 return len; 5449} 5450 5451static struct md_sysfs_entry md_sync_io_depth = 5452__ATTR_RW(sync_io_depth); 5453 5454static ssize_t 5455degraded_show(struct mddev *mddev, char *page) 5456{ 5457 return sprintf(page, "%d\n", mddev->degraded); 5458} 5459static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5460 5461static ssize_t 5462sync_force_parallel_show(struct mddev *mddev, char *page) 5463{ 5464 return sprintf(page, "%d\n", mddev->parallel_resync); 5465} 5466 5467static ssize_t 5468sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5469{ 5470 long n; 5471 5472 if (kstrtol(buf, 10, &n)) 5473 return -EINVAL; 5474 5475 if (n != 0 && n != 1) 5476 return -EINVAL; 5477 5478 mddev->parallel_resync = n; 5479 5480 if (mddev->sync_thread) 5481 wake_up(&resync_wait); 5482 5483 return len; 5484} 5485 5486/* force parallel resync, even with shared block devices */ 5487static struct md_sysfs_entry md_sync_force_parallel = 5488__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5489 sync_force_parallel_show, sync_force_parallel_store); 5490 5491static ssize_t 5492sync_speed_show(struct mddev *mddev, char *page) 5493{ 5494 unsigned long resync, dt, db; 5495 if (mddev->curr_resync == MD_RESYNC_NONE) 5496 return sprintf(page, "none\n"); 5497 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5498 dt = (jiffies - mddev->resync_mark) / HZ; 5499 if (!dt) dt++; 5500 db = resync - mddev->resync_mark_cnt; 5501 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5502} 5503 5504static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5505 5506static ssize_t 5507sync_completed_show(struct mddev *mddev, char *page) 5508{ 5509 unsigned long long max_sectors, resync; 5510 5511 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5512 return sprintf(page, "none\n"); 5513 5514 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5515 mddev->curr_resync == MD_RESYNC_DELAYED) 5516 return sprintf(page, "delayed\n"); 5517 5518 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5519 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5520 max_sectors = mddev->resync_max_sectors; 5521 else 5522 max_sectors = mddev->dev_sectors; 5523 5524 resync = mddev->curr_resync_completed; 5525 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5526} 5527 5528static struct md_sysfs_entry md_sync_completed = 5529 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5530 
5531static ssize_t 5532min_sync_show(struct mddev *mddev, char *page) 5533{ 5534 return sprintf(page, "%llu\n", 5535 (unsigned long long)mddev->resync_min); 5536} 5537static ssize_t 5538min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5539{ 5540 unsigned long long min; 5541 int err; 5542 5543 if (kstrtoull(buf, 10, &min)) 5544 return -EINVAL; 5545 5546 spin_lock(&mddev->lock); 5547 err = -EINVAL; 5548 if (min > mddev->resync_max) 5549 goto out_unlock; 5550 5551 err = -EBUSY; 5552 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5553 goto out_unlock; 5554 5555 /* Round down to multiple of 4K for safety */ 5556 mddev->resync_min = round_down(min, 8); 5557 err = 0; 5558 5559out_unlock: 5560 spin_unlock(&mddev->lock); 5561 return err ?: len; 5562} 5563 5564static struct md_sysfs_entry md_min_sync = 5565__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5566 5567static ssize_t 5568max_sync_show(struct mddev *mddev, char *page) 5569{ 5570 if (mddev->resync_max == MaxSector) 5571 return sprintf(page, "max\n"); 5572 else 5573 return sprintf(page, "%llu\n", 5574 (unsigned long long)mddev->resync_max); 5575} 5576static ssize_t 5577max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5578{ 5579 int err; 5580 spin_lock(&mddev->lock); 5581 if (strncmp(buf, "max", 3) == 0) 5582 mddev->resync_max = MaxSector; 5583 else { 5584 unsigned long long max; 5585 int chunk; 5586 5587 err = -EINVAL; 5588 if (kstrtoull(buf, 10, &max)) 5589 goto out_unlock; 5590 if (max < mddev->resync_min) 5591 goto out_unlock; 5592 5593 err = -EBUSY; 5594 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5595 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5596 goto out_unlock; 5597 5598 /* Must be a multiple of chunk_size */ 5599 chunk = mddev->chunk_sectors; 5600 if (chunk) { 5601 sector_t temp = max; 5602 5603 err = -EINVAL; 5604 if (sector_div(temp, chunk)) 5605 goto out_unlock; 5606 } 5607 mddev->resync_max = max; 5608 } 5609 wake_up(&mddev->recovery_wait); 5610 err = 0; 5611out_unlock: 5612 spin_unlock(&mddev->lock); 5613 return err ?: len; 5614} 5615 5616static struct md_sysfs_entry md_max_sync = 5617__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5618 5619static ssize_t 5620suspend_lo_show(struct mddev *mddev, char *page) 5621{ 5622 return sprintf(page, "%llu\n", 5623 (unsigned long long)READ_ONCE(mddev->suspend_lo)); 5624} 5625 5626static ssize_t 5627suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5628{ 5629 unsigned long long new; 5630 int err; 5631 5632 err = kstrtoull(buf, 10, &new); 5633 if (err < 0) 5634 return err; 5635 if (new != (sector_t)new) 5636 return -EINVAL; 5637 5638 err = mddev_suspend(mddev, true); 5639 if (err) 5640 return err; 5641 5642 WRITE_ONCE(mddev->suspend_lo, new); 5643 mddev_resume(mddev); 5644 5645 return len; 5646} 5647static struct md_sysfs_entry md_suspend_lo = 5648__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5649 5650static ssize_t 5651suspend_hi_show(struct mddev *mddev, char *page) 5652{ 5653 return sprintf(page, "%llu\n", 5654 (unsigned long long)READ_ONCE(mddev->suspend_hi)); 5655} 5656 5657static ssize_t 5658suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5659{ 5660 unsigned long long new; 5661 int err; 5662 5663 err = kstrtoull(buf, 10, &new); 5664 if (err < 0) 5665 return err; 5666 if (new != (sector_t)new) 5667 return -EINVAL; 5668 5669 err = mddev_suspend(mddev, true); 5670 if (err) 5671 return err; 5672 5673 WRITE_ONCE(mddev->suspend_hi, new); 5674 
mddev_resume(mddev); 5675 5676 return len; 5677} 5678static struct md_sysfs_entry md_suspend_hi = 5679__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5680 5681static ssize_t 5682reshape_position_show(struct mddev *mddev, char *page) 5683{ 5684 if (mddev->reshape_position != MaxSector) 5685 return sprintf(page, "%llu\n", 5686 (unsigned long long)mddev->reshape_position); 5687 strcpy(page, "none\n"); 5688 return 5; 5689} 5690 5691static ssize_t 5692reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5693{ 5694 struct md_rdev *rdev; 5695 unsigned long long new; 5696 int err; 5697 5698 err = kstrtoull(buf, 10, &new); 5699 if (err < 0) 5700 return err; 5701 if (new != (sector_t)new) 5702 return -EINVAL; 5703 err = mddev_lock(mddev); 5704 if (err) 5705 return err; 5706 err = -EBUSY; 5707 if (mddev->pers) 5708 goto unlock; 5709 mddev->reshape_position = new; 5710 mddev->delta_disks = 0; 5711 mddev->reshape_backwards = 0; 5712 mddev->new_level = mddev->level; 5713 mddev->new_layout = mddev->layout; 5714 mddev->new_chunk_sectors = mddev->chunk_sectors; 5715 rdev_for_each(rdev, mddev) 5716 rdev->new_data_offset = rdev->data_offset; 5717 err = 0; 5718unlock: 5719 mddev_unlock(mddev); 5720 return err ?: len; 5721} 5722 5723static struct md_sysfs_entry md_reshape_position = 5724__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5725 reshape_position_store); 5726 5727static ssize_t 5728reshape_direction_show(struct mddev *mddev, char *page) 5729{ 5730 return sprintf(page, "%s\n", 5731 mddev->reshape_backwards ? "backwards" : "forwards"); 5732} 5733 5734static ssize_t 5735reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5736{ 5737 int backwards = 0; 5738 int err; 5739 5740 if (cmd_match(buf, "forwards")) 5741 backwards = 0; 5742 else if (cmd_match(buf, "backwards")) 5743 backwards = 1; 5744 else 5745 return -EINVAL; 5746 if (mddev->reshape_backwards == backwards) 5747 return len; 5748 5749 err = mddev_lock(mddev); 5750 if (err) 5751 return err; 5752 /* check if we are allowed to change */ 5753 if (mddev->delta_disks) 5754 err = -EBUSY; 5755 else if (mddev->persistent && 5756 mddev->major_version == 0) 5757 err = -EINVAL; 5758 else 5759 mddev->reshape_backwards = backwards; 5760 mddev_unlock(mddev); 5761 return err ?: len; 5762} 5763 5764static struct md_sysfs_entry md_reshape_direction = 5765__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5766 reshape_direction_store); 5767 5768static ssize_t 5769array_size_show(struct mddev *mddev, char *page) 5770{ 5771 if (mddev->external_size) 5772 return sprintf(page, "%llu\n", 5773 (unsigned long long)mddev->array_sectors/2); 5774 else 5775 return sprintf(page, "default\n"); 5776} 5777 5778static ssize_t 5779array_size_store(struct mddev *mddev, const char *buf, size_t len) 5780{ 5781 sector_t sectors; 5782 int err; 5783 5784 err = mddev_lock(mddev); 5785 if (err) 5786 return err; 5787 5788 /* cluster raid doesn't support change array_sectors */ 5789 if (mddev_is_clustered(mddev)) { 5790 mddev_unlock(mddev); 5791 return -EINVAL; 5792 } 5793 5794 if (strncmp(buf, "default", 7) == 0) { 5795 if (mddev->pers) 5796 sectors = mddev->pers->size(mddev, 0, 0); 5797 else 5798 sectors = mddev->array_sectors; 5799 5800 mddev->external_size = 0; 5801 } else { 5802 if (strict_blocks_to_sectors(buf, &sectors) < 0) 5803 err = -EINVAL; 5804 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5805 err = -E2BIG; 5806 else 5807 mddev->external_size = 1; 5808 } 5809 5810 
if (!err) { 5811 mddev->array_sectors = sectors; 5812 if (mddev->pers) 5813 set_capacity_and_notify(mddev->gendisk, 5814 mddev->array_sectors); 5815 } 5816 mddev_unlock(mddev); 5817 return err ?: len; 5818} 5819 5820static struct md_sysfs_entry md_array_size = 5821__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5822 array_size_store); 5823 5824static ssize_t 5825consistency_policy_show(struct mddev *mddev, char *page) 5826{ 5827 int ret; 5828 5829 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5830 ret = sprintf(page, "journal\n"); 5831 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5832 ret = sprintf(page, "ppl\n"); 5833 } else if (mddev->bitmap) { 5834 ret = sprintf(page, "bitmap\n"); 5835 } else if (mddev->pers) { 5836 if (mddev->pers->sync_request) 5837 ret = sprintf(page, "resync\n"); 5838 else 5839 ret = sprintf(page, "none\n"); 5840 } else { 5841 ret = sprintf(page, "unknown\n"); 5842 } 5843 5844 return ret; 5845} 5846 5847static ssize_t 5848consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5849{ 5850 int err = 0; 5851 5852 if (mddev->pers) { 5853 if (mddev->pers->change_consistency_policy) 5854 err = mddev->pers->change_consistency_policy(mddev, buf); 5855 else 5856 err = -EBUSY; 5857 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5858 set_bit(MD_HAS_PPL, &mddev->flags); 5859 } else { 5860 err = -EINVAL; 5861 } 5862 5863 return err ? err : len; 5864} 5865 5866static struct md_sysfs_entry md_consistency_policy = 5867__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5868 consistency_policy_store); 5869 5870static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5871{ 5872 return sprintf(page, "%d\n", test_bit(MD_FAILLAST_DEV, &mddev->flags)); 5873} 5874 5875/* 5876 * Set MD_FAILLAST_DEV to allow the last device to be forcibly removed 5877 * from RAID1/RAID10. 5878 */ 5879static ssize_t 5880fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5881{ 5882 int ret; 5883 bool value; 5884 5885 ret = kstrtobool(buf, &value); 5886 if (ret) 5887 return ret; 5888 5889 if (value) 5890 set_bit(MD_FAILLAST_DEV, &mddev->flags); 5891 else 5892 clear_bit(MD_FAILLAST_DEV, &mddev->flags); 5893 5894 return len; 5895} 5896static struct md_sysfs_entry md_fail_last_dev = 5897__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5898 fail_last_dev_store); 5899 5900static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5901{ 5902 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) 5903 return sprintf(page, "n/a\n"); 5904 else 5905 return sprintf(page, "%d\n", 5906 test_bit(MD_SERIALIZE_POLICY, &mddev->flags)); 5907} 5908 5909/* 5910 * Setting MD_SERIALIZE_POLICY enforces that write IO is not reordered 5911 * for raid1.
5912 */ 5913static ssize_t 5914serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5915{ 5916 int err; 5917 bool value; 5918 5919 err = kstrtobool(buf, &value); 5920 if (err) 5921 return err; 5922 5923 if (value == test_bit(MD_SERIALIZE_POLICY, &mddev->flags)) 5924 return len; 5925 5926 err = mddev_suspend_and_lock(mddev); 5927 if (err) 5928 return err; 5929 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { 5930 pr_err("md: serialize_policy is only effective for raid1\n"); 5931 err = -EINVAL; 5932 goto unlock; 5933 } 5934 5935 if (value) { 5936 mddev_create_serial_pool(mddev, NULL); 5937 set_bit(MD_SERIALIZE_POLICY, &mddev->flags); 5938 } else { 5939 mddev_destroy_serial_pool(mddev, NULL); 5940 clear_bit(MD_SERIALIZE_POLICY, &mddev->flags); 5941 } 5942unlock: 5943 mddev_unlock_and_resume(mddev); 5944 return err ?: len; 5945} 5946 5947static struct md_sysfs_entry md_serialize_policy = 5948__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5949 serialize_policy_store); 5950 5951static int mddev_set_logical_block_size(struct mddev *mddev, 5952 unsigned int lbs) 5953{ 5954 int err = 0; 5955 struct queue_limits lim; 5956 5957 if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) { 5958 pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n", 5959 mdname(mddev), lbs); 5960 return -EINVAL; 5961 } 5962 5963 lim = queue_limits_start_update(mddev->gendisk->queue); 5964 lim.logical_block_size = lbs; 5965 pr_info("%s: logical_block_size is changed, data may be lost\n", 5966 mdname(mddev)); 5967 err = queue_limits_commit_update(mddev->gendisk->queue, &lim); 5968 if (err) 5969 return err; 5970 5971 mddev->logical_block_size = lbs; 5972 /* New lbs will be written to superblock after array is running */ 5973 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5974 return 0; 5975} 5976 5977static ssize_t 5978lbs_show(struct mddev *mddev, char *page) 5979{ 5980 return sprintf(page, "%u\n", mddev->logical_block_size); 5981} 5982 5983static ssize_t 5984lbs_store(struct mddev *mddev, const char *buf, size_t len) 5985{ 5986 unsigned int lbs; 5987 int err = -EBUSY; 5988 5989 /* Only 1.x meta supports configurable LBS */ 5990 if (mddev->major_version == 0) 5991 return -EINVAL; 5992 5993 err = kstrtouint(buf, 10, &lbs); 5994 if (err < 0) 5995 return -EINVAL; 5996 5997 if (mddev->pers) { 5998 unsigned int curr_lbs; 5999 6000 if (mddev->logical_block_size) 6001 return -EBUSY; 6002 /* 6003 * To fix forward compatibility issues, LBS is not 6004 * configured for arrays from old kernels (<=6.18) by default. 6005 * If the user confirms no rollback to old kernels, 6006 * enable LBS by writing current LBS — to prevent data 6007 * loss from LBS changes. 
6008 */ 6009 curr_lbs = queue_logical_block_size(mddev->gendisk->queue); 6010 if (lbs != curr_lbs) 6011 return -EINVAL; 6012 6013 mddev->logical_block_size = curr_lbs; 6014 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6015 pr_info("%s: logical block size configured successfully, array will not be assembled in old kernels (<= 6.18)\n", 6016 mdname(mddev)); 6017 return len; 6018 } 6019 6020 err = mddev_lock(mddev); 6021 if (err) 6022 goto unlock; 6023 6024 err = mddev_set_logical_block_size(mddev, lbs); 6025 6026unlock: 6027 mddev_unlock(mddev); 6028 return err ?: len; 6029} 6030 6031static struct md_sysfs_entry md_logical_block_size = 6032__ATTR(logical_block_size, 0644, lbs_show, lbs_store); 6033 6034static struct attribute *md_default_attrs[] = { 6035 &md_level.attr, 6036 &md_new_level.attr, 6037 &md_bitmap_type.attr, 6038 &md_layout.attr, 6039 &md_raid_disks.attr, 6040 &md_uuid.attr, 6041 &md_chunk_size.attr, 6042 &md_size.attr, 6043 &md_resync_start.attr, 6044 &md_metadata.attr, 6045 &md_new_device.attr, 6046 &md_safe_delay.attr, 6047 &md_array_state.attr, 6048 &md_reshape_position.attr, 6049 &md_reshape_direction.attr, 6050 &md_array_size.attr, 6051 &max_corr_read_errors.attr, 6052 &md_consistency_policy.attr, 6053 &md_fail_last_dev.attr, 6054 &md_serialize_policy.attr, 6055 &md_logical_block_size.attr, 6056 NULL, 6057}; 6058ATTRIBUTE_GROUPS(md_default); 6059 6060static struct attribute *md_redundancy_attrs[] = { 6061 &md_scan_mode.attr, 6062 &md_last_scan_mode.attr, 6063 &md_mismatches.attr, 6064 &md_sync_min.attr, 6065 &md_sync_max.attr, 6066 &md_sync_io_depth.attr, 6067 &md_sync_speed.attr, 6068 &md_sync_force_parallel.attr, 6069 &md_sync_completed.attr, 6070 &md_min_sync.attr, 6071 &md_max_sync.attr, 6072 &md_suspend_lo.attr, 6073 &md_suspend_hi.attr, 6074 &md_bitmap.attr, 6075 &md_degraded.attr, 6076 NULL, 6077}; 6078static const struct attribute_group md_redundancy_group = { 6079 .name = NULL, 6080 .attrs = md_redundancy_attrs, 6081}; 6082 6083static ssize_t 6084md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 6085{ 6086 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 6087 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 6088 ssize_t rv; 6089 6090 if (!entry->show) 6091 return -EIO; 6092 spin_lock(&all_mddevs_lock); 6093 if (!mddev_get(mddev)) { 6094 spin_unlock(&all_mddevs_lock); 6095 return -EBUSY; 6096 } 6097 spin_unlock(&all_mddevs_lock); 6098 6099 rv = entry->show(mddev, page); 6100 mddev_put(mddev); 6101 return rv; 6102} 6103 6104static ssize_t 6105md_attr_store(struct kobject *kobj, struct attribute *attr, 6106 const char *page, size_t length) 6107{ 6108 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 6109 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 6110 ssize_t rv; 6111 struct kernfs_node *kn = NULL; 6112 6113 if (!entry->store) 6114 return -EIO; 6115 if (!capable(CAP_SYS_ADMIN)) 6116 return -EACCES; 6117 6118 if (entry->store == array_state_store && cmd_match(page, "clear")) 6119 kn = sysfs_break_active_protection(kobj, attr); 6120 6121 spin_lock(&all_mddevs_lock); 6122 if (!mddev_get(mddev)) { 6123 spin_unlock(&all_mddevs_lock); 6124 if (kn) 6125 sysfs_unbreak_active_protection(kn); 6126 return -EBUSY; 6127 } 6128 spin_unlock(&all_mddevs_lock); 6129 rv = entry->store(mddev, page, length); 6130 6131 /* 6132 * For "array_state=clear", dropping the extra kobject reference from 6133 * sysfs_break_active_protection() can trigger md kobject deletion. 
6134 * Restore active protection before mddev_put() so deletion happens
6135 * after the sysfs write path fully unwinds.
6136 */
6137 if (kn)
6138 sysfs_unbreak_active_protection(kn);
6139 mddev_put(mddev);
6140
6141 return rv;
6142}
6143
6144static void md_kobj_release(struct kobject *ko)
6145{
6146 struct mddev *mddev = container_of(ko, struct mddev, kobj);
6147
6148 if (legacy_async_del_gendisk) {
6149 if (mddev->sysfs_state)
6150 sysfs_put(mddev->sysfs_state);
6151 if (mddev->sysfs_level)
6152 sysfs_put(mddev->sysfs_level);
6153 del_gendisk(mddev->gendisk);
6154 }
6155 put_disk(mddev->gendisk);
6156}
6157
6158static const struct sysfs_ops md_sysfs_ops = {
6159 .show = md_attr_show,
6160 .store = md_attr_store,
6161};
6162static const struct kobj_type md_ktype = {
6163 .release = md_kobj_release,
6164 .sysfs_ops = &md_sysfs_ops,
6165 .default_groups = md_default_groups,
6166};
6167
6168int mdp_major = 0;
6169
6170/* stack the limits of all rdevs into lim */
6171int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
6172 unsigned int flags)
6173{
6174 struct md_rdev *rdev;
6175
6176 rdev_for_each(rdev, mddev) {
6177 queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
6178 mddev->gendisk->disk_name);
6179 if ((flags & MDDEV_STACK_INTEGRITY) &&
6180 !queue_limits_stack_integrity_bdev(lim, rdev->bdev))
6181 return -EINVAL;
6182 }
6183
6184 /*
6185 * Until RAID gains folio support, the logical_block_size
6186 * must not be larger than the page size.
6187 */
6188 if (lim->logical_block_size > PAGE_SIZE) {
6189 pr_err("%s: logical_block_size must not be larger than PAGE_SIZE\n",
6190 mdname(mddev));
6191 return -EINVAL;
6192 }
6193
6194 /* Only 1.x meta needs to set logical block size */
6195 if (mddev->major_version == 0)
6196 return 0;
6197
6198 /*
6199 * To avoid a forward compatibility issue, only set LBS by default for
6200 * new arrays (mddev->events == 0 indicates the array was just
6201 * created). When assembling an array, read LBS from the superblock
6202 * instead, since LBS is 0 in superblocks created by old kernels.
6203 */ 6204 if (!mddev->events) { 6205 pr_info("%s: array will not be assembled in old kernels that lack configurable LBS support (<= 6.18)\n", 6206 mdname(mddev)); 6207 mddev->logical_block_size = lim->logical_block_size; 6208 } 6209 6210 if (!mddev->logical_block_size) 6211 pr_warn("%s: echo current LBS to md/logical_block_size to prevent data loss issues from LBS changes.\n" 6212 "\tNote: After setting, array will not be assembled in old kernels (<= 6.18)\n", 6213 mdname(mddev)); 6214 6215 return 0; 6216} 6217EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); 6218 6219/* apply the extra stacking limits from a new rdev into mddev */ 6220int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) 6221{ 6222 struct queue_limits lim; 6223 6224 if (mddev_is_dm(mddev)) 6225 return 0; 6226 6227 if (queue_logical_block_size(rdev->bdev->bd_disk->queue) > 6228 queue_logical_block_size(mddev->gendisk->queue)) { 6229 pr_err("%s: incompatible logical_block_size, can not add\n", 6230 mdname(mddev)); 6231 return -EINVAL; 6232 } 6233 6234 lim = queue_limits_start_update(mddev->gendisk->queue); 6235 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, 6236 mddev->gendisk->disk_name); 6237 6238 if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { 6239 pr_err("%s: incompatible integrity profile for %pg\n", 6240 mdname(mddev), rdev->bdev); 6241 queue_limits_cancel_update(mddev->gendisk->queue); 6242 return -ENXIO; 6243 } 6244 6245 return queue_limits_commit_update(mddev->gendisk->queue, &lim); 6246} 6247EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); 6248 6249/* update the optimal I/O size after a reshape */ 6250void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) 6251{ 6252 struct queue_limits lim; 6253 6254 if (mddev_is_dm(mddev)) 6255 return; 6256 6257 /* don't bother updating io_opt if we can't suspend the array */ 6258 if (mddev_suspend(mddev, false) < 0) 6259 return; 6260 lim = queue_limits_start_update(mddev->gendisk->queue); 6261 lim.io_opt = lim.io_min * nr_stripes; 6262 queue_limits_commit_update(mddev->gendisk->queue, &lim); 6263 mddev_resume(mddev); 6264} 6265EXPORT_SYMBOL_GPL(mddev_update_io_opt); 6266 6267static void mddev_delayed_delete(struct work_struct *ws) 6268{ 6269 struct mddev *mddev = container_of(ws, struct mddev, del_work); 6270 6271 kobject_put(&mddev->kobj); 6272} 6273 6274void md_init_stacking_limits(struct queue_limits *lim) 6275{ 6276 blk_set_stacking_limits(lim); 6277 lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | 6278 BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; 6279} 6280EXPORT_SYMBOL_GPL(md_init_stacking_limits); 6281 6282struct mddev *md_alloc(dev_t dev, char *name) 6283{ 6284 /* 6285 * If dev is zero, name is the name of a device to allocate with 6286 * an arbitrary minor number. It will be "md_???" 6287 * If dev is non-zero it must be a device number with a MAJOR of 6288 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 6289 * the device is being created by opening a node in /dev. 6290 * If "name" is not NULL, the device is being created by 6291 * writing to /sys/module/md_mod/parameters/new_array. 6292 */ 6293 static DEFINE_MUTEX(disks_mutex); 6294 struct mddev *mddev; 6295 struct gendisk *disk; 6296 int partitioned; 6297 int shift; 6298 int unit; 6299 int error; 6300 6301 /* 6302 * Wait for any previous instance of this device to be completely 6303 * removed (mddev_delayed_delete). 
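 * mddev_delayed_delete() runs from md_misc_wq, so flushing that
 * workqueue below guarantees the old kobject is fully released before
 * the unit number can be reused.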
6304 */ 6305 flush_workqueue(md_misc_wq); 6306 6307 mutex_lock(&disks_mutex); 6308 mddev = mddev_alloc(dev); 6309 if (IS_ERR(mddev)) { 6310 error = PTR_ERR(mddev); 6311 goto out_unlock; 6312 } 6313 6314 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 6315 shift = partitioned ? MdpMinorShift : 0; 6316 unit = MINOR(mddev->unit) >> shift; 6317 6318 if (name && !dev) { 6319 /* Need to ensure that 'name' is not a duplicate. 6320 */ 6321 struct mddev *mddev2; 6322 spin_lock(&all_mddevs_lock); 6323 6324 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 6325 if (mddev2->gendisk && 6326 strcmp(mddev2->gendisk->disk_name, name) == 0) { 6327 spin_unlock(&all_mddevs_lock); 6328 error = -EEXIST; 6329 goto out_free_mddev; 6330 } 6331 spin_unlock(&all_mddevs_lock); 6332 } 6333 if (name && dev) 6334 /* 6335 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 6336 */ 6337 mddev->hold_active = UNTIL_STOP; 6338 6339 disk = blk_alloc_disk(NULL, NUMA_NO_NODE); 6340 if (IS_ERR(disk)) { 6341 error = PTR_ERR(disk); 6342 goto out_free_mddev; 6343 } 6344 6345 disk->major = MAJOR(mddev->unit); 6346 disk->first_minor = unit << shift; 6347 disk->minors = 1 << shift; 6348 if (name) 6349 strcpy(disk->disk_name, name); 6350 else if (partitioned) 6351 sprintf(disk->disk_name, "md_d%d", unit); 6352 else 6353 sprintf(disk->disk_name, "md%d", unit); 6354 disk->fops = &md_fops; 6355 disk->private_data = mddev; 6356 6357 disk->events |= DISK_EVENT_MEDIA_CHANGE; 6358 mddev->gendisk = disk; 6359 error = add_disk(disk); 6360 if (error) 6361 goto out_put_disk; 6362 6363 kobject_init(&mddev->kobj, &md_ktype); 6364 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 6365 if (error) { 6366 /* 6367 * The disk is already live at this point. Clear the hold flag 6368 * and let mddev_put take care of the deletion, as it isn't any 6369 * different from a normal close on last release now. 6370 */ 6371 mddev->hold_active = 0; 6372 mutex_unlock(&disks_mutex); 6373 mddev_put(mddev); 6374 return ERR_PTR(error); 6375 } 6376 6377 kobject_uevent(&mddev->kobj, KOBJ_ADD); 6378 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 6379 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 6380 mutex_unlock(&disks_mutex); 6381 return mddev; 6382 6383out_put_disk: 6384 put_disk(disk); 6385out_free_mddev: 6386 mddev_free(mddev); 6387out_unlock: 6388 mutex_unlock(&disks_mutex); 6389 return ERR_PTR(error); 6390} 6391 6392static int md_alloc_and_put(dev_t dev, char *name) 6393{ 6394 struct mddev *mddev = md_alloc(dev, name); 6395 6396 if (legacy_async_del_gendisk) 6397 pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n"); 6398 6399 if (IS_ERR(mddev)) 6400 return PTR_ERR(mddev); 6401 mddev_put(mddev); 6402 return 0; 6403} 6404 6405static void md_probe(dev_t dev) 6406{ 6407 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 6408 return; 6409 if (create_on_open) 6410 md_alloc_and_put(dev, NULL); 6411} 6412 6413static int add_named_array(const char *val, const struct kernel_param *kp) 6414{ 6415 /* 6416 * val must be "md_*" or "mdNNN". 6417 * For "md_*" we allocate an array with a large free minor number, and 6418 * set the name to val. val must not already be an active name. 6419 * For "mdNNN" we allocate an array with the minor number NNN 6420 * which must not already be in use. 
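 *
 * Illustrative examples (names are assumptions):
 *
 *	echo md_home > /sys/module/md_mod/parameters/new_array
 *	echo md127 > /sys/module/md_mod/parameters/new_array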
6421 */ 6422 int len = strlen(val); 6423 char buf[DISK_NAME_LEN]; 6424 unsigned long devnum; 6425 6426 while (len && val[len-1] == '\n') 6427 len--; 6428 if (len >= DISK_NAME_LEN) 6429 return -E2BIG; 6430 strscpy(buf, val, len+1); 6431 if (strncmp(buf, "md_", 3) == 0) 6432 return md_alloc_and_put(0, buf); 6433 if (strncmp(buf, "md", 2) == 0 && 6434 isdigit(buf[2]) && 6435 kstrtoul(buf+2, 10, &devnum) == 0 && 6436 devnum <= MINORMASK) 6437 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 6438 6439 return -EINVAL; 6440} 6441 6442static void md_safemode_timeout(struct timer_list *t) 6443{ 6444 struct mddev *mddev = timer_container_of(mddev, t, safemode_timer); 6445 6446 mddev->safemode = 1; 6447 if (mddev->external) 6448 sysfs_notify_dirent_safe(mddev->sysfs_state); 6449 6450 md_wakeup_thread(mddev->thread); 6451} 6452 6453static int start_dirty_degraded; 6454 6455/* 6456 * Read bitmap superblock and return the bitmap_id based on disk version. 6457 * This is used as fallback when default bitmap version and on-disk version 6458 * doesn't match, and mdadm is not the latest version to set bitmap_type. 6459 */ 6460static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev) 6461{ 6462 struct md_rdev *rdev; 6463 struct page *sb_page; 6464 bitmap_super_t *sb; 6465 enum md_submodule_id id = ID_BITMAP_NONE; 6466 sector_t sector; 6467 u32 version; 6468 6469 if (!mddev->bitmap_info.offset) 6470 return ID_BITMAP_NONE; 6471 6472 sb_page = alloc_page(GFP_KERNEL); 6473 if (!sb_page) { 6474 pr_warn("md: %s: failed to allocate memory for bitmap\n", 6475 mdname(mddev)); 6476 return ID_BITMAP_NONE; 6477 } 6478 6479 sector = mddev->bitmap_info.offset; 6480 6481 rdev_for_each(rdev, mddev) { 6482 u32 iosize; 6483 6484 if (!test_bit(In_sync, &rdev->flags) || 6485 test_bit(Faulty, &rdev->flags) || 6486 test_bit(Bitmap_sync, &rdev->flags)) 6487 continue; 6488 6489 iosize = roundup(sizeof(bitmap_super_t), 6490 bdev_logical_block_size(rdev->bdev)); 6491 if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ, 6492 true)) 6493 goto read_ok; 6494 } 6495 pr_warn("md: %s: failed to read bitmap from any device\n", 6496 mdname(mddev)); 6497 goto out; 6498 6499read_ok: 6500 sb = kmap_local_page(sb_page); 6501 if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { 6502 pr_warn("md: %s: invalid bitmap magic 0x%x\n", 6503 mdname(mddev), le32_to_cpu(sb->magic)); 6504 goto out_unmap; 6505 } 6506 6507 version = le32_to_cpu(sb->version); 6508 switch (version) { 6509 case BITMAP_MAJOR_LO: 6510 case BITMAP_MAJOR_HI: 6511 case BITMAP_MAJOR_CLUSTERED: 6512 id = ID_BITMAP; 6513 break; 6514 case BITMAP_MAJOR_LOCKLESS: 6515 id = ID_LLBITMAP; 6516 break; 6517 default: 6518 pr_warn("md: %s: unknown bitmap version %u\n", 6519 mdname(mddev), version); 6520 break; 6521 } 6522 6523out_unmap: 6524 kunmap_local(sb); 6525out: 6526 __free_page(sb_page); 6527 return id; 6528} 6529 6530int md_bitmap_create_nosysfs(struct mddev *mddev) 6531{ 6532 enum md_submodule_id orig_id = mddev->bitmap_id; 6533 enum md_submodule_id sb_id; 6534 int err; 6535 6536 if (mddev->bitmap_id == ID_BITMAP_NONE) 6537 return -EINVAL; 6538 6539 if (!mddev_set_bitmap_ops_nosysfs(mddev)) { 6540 mddev->bitmap_id = orig_id; 6541 return -ENOENT; 6542 } 6543 6544 err = mddev->bitmap_ops->create(mddev); 6545 if (!err) 6546 return 0; 6547 6548 /* 6549 * Create failed, if default bitmap version and on-disk version 6550 * doesn't match, and mdadm is not the latest version to set 6551 * bitmap_type, set bitmap_ops based on the disk version. 
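 *
 * For example, an array whose on-disk bitmap superblock carries
 * BITMAP_MAJOR_LOCKLESS while bitmap_id defaulted to the classic
 * bitmap will be retried below with ID_LLBITMAP.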
6552 */ 6553 mddev->bitmap_ops = NULL; 6554 6555 sb_id = md_bitmap_get_id_from_sb(mddev); 6556 if (sb_id == ID_BITMAP_NONE || sb_id == orig_id) { 6557 mddev->bitmap_id = orig_id; 6558 return err; 6559 } 6560 6561 pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n", 6562 mdname(mddev), orig_id, sb_id); 6563 6564 mddev->bitmap_id = sb_id; 6565 if (!mddev_set_bitmap_ops_nosysfs(mddev)) { 6566 mddev->bitmap_id = orig_id; 6567 return -ENOENT; 6568 } 6569 6570 err = mddev->bitmap_ops->create(mddev); 6571 if (err) { 6572 mddev->bitmap_ops = NULL; 6573 mddev->bitmap_id = orig_id; 6574 } 6575 6576 return err; 6577} 6578 6579static int md_bitmap_create(struct mddev *mddev) 6580{ 6581 int err; 6582 6583 err = md_bitmap_create_nosysfs(mddev); 6584 if (err) 6585 return err; 6586 6587 if (!mddev_is_dm(mddev) && mddev->bitmap_ops->groups) 6588 md_bitmap_sysfs_add(mddev); 6589 6590 return 0; 6591} 6592 6593void md_bitmap_destroy_nosysfs(struct mddev *mddev) 6594{ 6595 if (!md_bitmap_registered(mddev)) 6596 return; 6597 6598 mddev->bitmap_ops->destroy(mddev); 6599 mddev->bitmap_ops = NULL; 6600} 6601 6602static void md_bitmap_destroy(struct mddev *mddev) 6603{ 6604 if (!mddev_is_dm(mddev) && mddev->bitmap_ops && 6605 mddev->bitmap_ops->groups) 6606 md_bitmap_sysfs_del(mddev); 6607 6608 md_bitmap_destroy_nosysfs(mddev); 6609} 6610 6611static void md_bitmap_set_none(struct mddev *mddev) 6612{ 6613 mddev->bitmap_id = ID_BITMAP_NONE; 6614 if (!mddev_set_bitmap_ops_nosysfs(mddev)) 6615 return; 6616 6617 if (!mddev_is_dm(mddev) && mddev->bitmap_ops->groups) 6618 md_bitmap_sysfs_add(mddev); 6619} 6620 6621int md_run(struct mddev *mddev) 6622{ 6623 int err; 6624 struct md_rdev *rdev; 6625 struct md_personality *pers; 6626 bool nowait = true; 6627 6628 if (list_empty(&mddev->disks)) 6629 /* cannot run an array with no devices.. */ 6630 return -EINVAL; 6631 6632 if (mddev->pers) 6633 return -EBUSY; 6634 /* Cannot run until previous stop completes properly */ 6635 if (mddev->sysfs_active) 6636 return -EBUSY; 6637 6638 /* 6639 * Analyze all RAID superblock(s) 6640 */ 6641 if (!mddev->raid_disks) { 6642 if (!mddev->persistent) 6643 return -EINVAL; 6644 err = analyze_sbs(mddev); 6645 if (err) 6646 return -EINVAL; 6647 } 6648 6649 if (mddev->level != LEVEL_NONE) 6650 request_module("md-level-%d", mddev->level); 6651 else if (mddev->clevel[0]) 6652 request_module("md-%s", mddev->clevel); 6653 6654 /* 6655 * Drop all container device buffers, from now on 6656 * the only valid external interface is through the md 6657 * device. 6658 */ 6659 clear_bit(MD_HAS_SUPERBLOCK, &mddev->flags); 6660 rdev_for_each(rdev, mddev) { 6661 if (test_bit(Faulty, &rdev->flags)) 6662 continue; 6663 sync_blockdev(rdev->bdev); 6664 invalidate_bdev(rdev->bdev); 6665 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 6666 mddev->ro = MD_RDONLY; 6667 if (!mddev_is_dm(mddev)) 6668 set_disk_ro(mddev->gendisk, 1); 6669 } 6670 6671 if (rdev->sb_page) 6672 set_bit(MD_HAS_SUPERBLOCK, &mddev->flags); 6673 6674 /* perform some consistency tests on the device. 6675 * We don't want the data to overlap the metadata, 6676 * Internal Bitmap issues have been handled elsewhere. 
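 * Concretely: when the data region precedes the superblock,
 * data_offset + dev_sectors must not run past sb_start; when the
 * superblock precedes the data, sb_start plus the superblock size in
 * sectors must not run past data_offset.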
6677 */ 6678 if (rdev->meta_bdev) { 6679 /* Nothing to check */; 6680 } else if (rdev->data_offset < rdev->sb_start) { 6681 if (mddev->dev_sectors && 6682 rdev->data_offset + mddev->dev_sectors 6683 > rdev->sb_start) { 6684 pr_warn("md: %s: data overlaps metadata\n", 6685 mdname(mddev)); 6686 return -EINVAL; 6687 } 6688 } else { 6689 if (rdev->sb_start + rdev->sb_size/512 6690 > rdev->data_offset) { 6691 pr_warn("md: %s: metadata overlaps data\n", 6692 mdname(mddev)); 6693 return -EINVAL; 6694 } 6695 } 6696 sysfs_notify_dirent_safe(rdev->sysfs_state); 6697 nowait = nowait && bdev_nowait(rdev->bdev); 6698 } 6699 6700 pers = get_pers(mddev->level, mddev->clevel); 6701 if (!pers) 6702 return -EINVAL; 6703 if (mddev->level != pers->head.id) { 6704 mddev->level = pers->head.id; 6705 mddev->new_level = pers->head.id; 6706 } 6707 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 6708 6709 if (mddev->reshape_position != MaxSector && 6710 pers->start_reshape == NULL) { 6711 /* This personality cannot handle reshaping... */ 6712 put_pers(pers); 6713 return -EINVAL; 6714 } 6715 6716 if (pers->sync_request) { 6717 /* Warn if this is a potentially silly 6718 * configuration. 6719 */ 6720 struct md_rdev *rdev2; 6721 int warned = 0; 6722 6723 rdev_for_each(rdev, mddev) 6724 rdev_for_each(rdev2, mddev) { 6725 if (rdev < rdev2 && 6726 rdev->bdev->bd_disk == 6727 rdev2->bdev->bd_disk) { 6728 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 6729 mdname(mddev), 6730 rdev->bdev, 6731 rdev2->bdev); 6732 warned = 1; 6733 } 6734 } 6735 6736 if (warned) 6737 pr_warn("True protection against single-disk failure might be compromised.\n"); 6738 } 6739 6740 /* dm-raid expect sync_thread to be frozen until resume */ 6741 if (!mddev_is_dm(mddev)) 6742 mddev->recovery = 0; 6743 6744 /* may be over-ridden by personality */ 6745 mddev->resync_max_sectors = mddev->dev_sectors; 6746 6747 mddev->ok_start_degraded = start_dirty_degraded; 6748 6749 if (start_readonly && md_is_rdwr(mddev)) 6750 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 6751 6752 err = pers->run(mddev); 6753 if (err) 6754 pr_warn("md: pers->run() failed ...\n"); 6755 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6756 WARN_ONCE(!mddev->external_size, 6757 "%s: default size too small, but 'external_size' not in effect?\n", 6758 __func__); 6759 pr_warn("md: invalid array_size %llu > default size %llu\n", 6760 (unsigned long long)mddev->array_sectors / 2, 6761 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6762 err = -EINVAL; 6763 } 6764 if (err == 0 && pers->sync_request && 6765 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6766 err = md_bitmap_create(mddev); 6767 if (err) 6768 pr_warn("%s: failed to create bitmap (%d)\n", 6769 mdname(mddev), err); 6770 } 6771 if (err) 6772 goto bitmap_abort; 6773 6774 if (mddev->bitmap_info.max_write_behind > 0) { 6775 bool create_pool = false; 6776 6777 rdev_for_each(rdev, mddev) { 6778 if (test_bit(WriteMostly, &rdev->flags) && 6779 rdev_init_serial(rdev)) 6780 create_pool = true; 6781 } 6782 if (create_pool && mddev->serial_info_pool == NULL) { 6783 mddev->serial_info_pool = 6784 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6785 sizeof(struct serial_info)); 6786 if (!mddev->serial_info_pool) { 6787 err = -ENOMEM; 6788 goto bitmap_abort; 6789 } 6790 } 6791 } 6792 6793 if (pers->sync_request) { 6794 if (mddev->kobj.sd && 6795 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6796 pr_warn("md: cannot register extra attributes for 
%s\n", 6797 mdname(mddev)); 6798 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6799 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6800 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6801 } else if (mddev->ro == MD_AUTO_READ) 6802 mddev->ro = MD_RDWR; 6803 6804 atomic_set(&mddev->max_corr_read_errors, 6805 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6806 mddev->safemode = 0; 6807 if (mddev_is_clustered(mddev)) 6808 mddev->safemode_delay = 0; 6809 else 6810 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6811 mddev->in_sync = 1; 6812 smp_wmb(); 6813 spin_lock(&mddev->lock); 6814 mddev->pers = pers; 6815 spin_unlock(&mddev->lock); 6816 rdev_for_each(rdev, mddev) 6817 if (rdev->raid_disk >= 0) 6818 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6819 6820 if (mddev->degraded && md_is_rdwr(mddev)) 6821 /* This ensures that recovering status is reported immediately 6822 * via sysfs - until a lack of spares is confirmed. 6823 */ 6824 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6825 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6826 6827 if (mddev->sb_flags) 6828 md_update_sb(mddev, 0); 6829 6830 if (IS_ENABLED(CONFIG_MD_BITMAP) && !mddev->bitmap_info.file && 6831 !mddev->bitmap_info.offset) 6832 md_bitmap_set_none(mddev); 6833 6834 md_new_event(); 6835 return 0; 6836 6837bitmap_abort: 6838 mddev_detach(mddev); 6839 if (mddev->private) 6840 pers->free(mddev, mddev->private); 6841 mddev->private = NULL; 6842 put_pers(pers); 6843 md_bitmap_destroy(mddev); 6844 return err; 6845} 6846EXPORT_SYMBOL_GPL(md_run); 6847 6848int do_md_run(struct mddev *mddev) 6849{ 6850 int err; 6851 6852 set_bit(MD_NOT_READY, &mddev->flags); 6853 err = md_run(mddev); 6854 if (err) 6855 goto out; 6856 6857 if (md_bitmap_registered(mddev)) { 6858 err = mddev->bitmap_ops->load(mddev); 6859 if (err) { 6860 md_bitmap_destroy(mddev); 6861 goto out; 6862 } 6863 } 6864 6865 if (mddev_is_clustered(mddev)) 6866 md_allow_write(mddev); 6867 6868 /* run start up tasks that require md_thread */ 6869 md_start(mddev); 6870 6871 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6872 6873 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6874 clear_bit(MD_NOT_READY, &mddev->flags); 6875 mddev->changed = 1; 6876 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6877 sysfs_notify_dirent_safe(mddev->sysfs_state); 6878 sysfs_notify_dirent_safe(mddev->sysfs_action); 6879 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6880out: 6881 clear_bit(MD_NOT_READY, &mddev->flags); 6882 return err; 6883} 6884 6885int md_start(struct mddev *mddev) 6886{ 6887 int ret = 0; 6888 6889 if (mddev->pers->start) { 6890 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6891 ret = mddev->pers->start(mddev); 6892 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6893 md_wakeup_thread(mddev->sync_thread); 6894 } 6895 return ret; 6896} 6897EXPORT_SYMBOL_GPL(md_start); 6898 6899static int restart_array(struct mddev *mddev) 6900{ 6901 struct gendisk *disk = mddev->gendisk; 6902 struct md_rdev *rdev; 6903 bool has_journal = false; 6904 bool has_readonly = false; 6905 6906 /* Complain if it has no devices */ 6907 if (list_empty(&mddev->disks)) 6908 return -ENXIO; 6909 if (!mddev->pers) 6910 return -EINVAL; 6911 if (md_is_rdwr(mddev)) 6912 return -EBUSY; 6913 6914 rcu_read_lock(); 6915 rdev_for_each_rcu(rdev, mddev) { 6916 if (test_bit(Journal, &rdev->flags) && 6917 !test_bit(Faulty, &rdev->flags)) 6918 has_journal = true; 6919 if 
(rdev_read_only(rdev)) 6920 has_readonly = true; 6921 } 6922 rcu_read_unlock(); 6923 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6924 /* Don't restart rw with journal missing/faulty */ 6925 return -EINVAL; 6926 if (has_readonly) 6927 return -EROFS; 6928 6929 mddev->safemode = 0; 6930 mddev->ro = MD_RDWR; 6931 set_disk_ro(disk, 0); 6932 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6933 /* Kick recovery or resync if necessary */ 6934 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6935 md_wakeup_thread(mddev->sync_thread); 6936 sysfs_notify_dirent_safe(mddev->sysfs_state); 6937 return 0; 6938} 6939 6940static void md_clean(struct mddev *mddev) 6941{ 6942 mddev->array_sectors = 0; 6943 mddev->external_size = 0; 6944 mddev->dev_sectors = 0; 6945 mddev->raid_disks = 0; 6946 mddev->resync_offset = 0; 6947 mddev->resync_min = 0; 6948 mddev->resync_max = MaxSector; 6949 mddev->reshape_position = MaxSector; 6950 /* we still need mddev->external in export_rdev, do not clear it yet */ 6951 mddev->persistent = 0; 6952 mddev->level = LEVEL_NONE; 6953 mddev->clevel[0] = 0; 6954 6955 /* 6956 * For legacy_async_del_gendisk mode, it can stop the array in the 6957 * middle of assembling it, then it still can access the array. So 6958 * it needs to clear MD_CLOSING. If not legacy_async_del_gendisk, 6959 * it can't open the array again after stopping it. So it doesn't 6960 * clear MD_CLOSING. 6961 */ 6962 if (legacy_async_del_gendisk && mddev->hold_active) { 6963 clear_bit(MD_CLOSING, &mddev->flags); 6964 } else { 6965 /* if UNTIL_STOP is set, it's cleared here */ 6966 mddev->hold_active = 0; 6967 /* Don't clear MD_CLOSING, or mddev can be opened again. */ 6968 mddev->flags &= BIT_ULL_MASK(MD_CLOSING); 6969 } 6970 mddev->sb_flags = 0; 6971 mddev->ro = MD_RDWR; 6972 mddev->metadata_type[0] = 0; 6973 mddev->chunk_sectors = 0; 6974 mddev->ctime = mddev->utime = 0; 6975 mddev->layout = 0; 6976 mddev->logical_block_size = 0; 6977 mddev->max_disks = 0; 6978 mddev->events = 0; 6979 mddev->can_decrease_events = 0; 6980 mddev->delta_disks = 0; 6981 mddev->reshape_backwards = 0; 6982 mddev->new_level = LEVEL_NONE; 6983 mddev->new_layout = 0; 6984 mddev->new_chunk_sectors = 0; 6985 mddev->curr_resync = MD_RESYNC_NONE; 6986 atomic64_set(&mddev->resync_mismatches, 0); 6987 mddev->suspend_lo = mddev->suspend_hi = 0; 6988 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6989 mddev->recovery = 0; 6990 mddev->in_sync = 0; 6991 mddev->changed = 0; 6992 mddev->degraded = 0; 6993 mddev->safemode = 0; 6994 mddev->private = NULL; 6995 mddev->cluster_info = NULL; 6996 mddev->bitmap_info.offset = 0; 6997 mddev->bitmap_info.default_offset = 0; 6998 mddev->bitmap_info.default_space = 0; 6999 mddev->bitmap_info.chunksize = 0; 7000 mddev->bitmap_info.daemon_sleep = 0; 7001 mddev->bitmap_info.max_write_behind = 0; 7002 mddev->bitmap_info.nodes = 0; 7003} 7004 7005static void __md_stop_writes(struct mddev *mddev) 7006{ 7007 timer_delete_sync(&mddev->safemode_timer); 7008 7009 if (md_is_rdwr(mddev) || !mddev_is_dm(mddev)) { 7010 if (mddev->pers && mddev->pers->quiesce) { 7011 mddev->pers->quiesce(mddev, 1); 7012 mddev->pers->quiesce(mddev, 0); 7013 } 7014 7015 if (md_bitmap_enabled(mddev, true)) 7016 mddev->bitmap_ops->flush(mddev); 7017 } 7018 7019 if (md_is_rdwr(mddev) && 7020 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 7021 mddev->sb_flags)) { 7022 /* mark array as shutdown cleanly */ 7023 if (!mddev_is_clustered(mddev)) 7024 mddev->in_sync = 1; 7025 md_update_sb(mddev, 1); 7026 } 7027 /* 
disable policy to guarantee rdevs free resources for serialization */ 7028 clear_bit(MD_SERIALIZE_POLICY, &mddev->flags); 7029 mddev_destroy_serial_pool(mddev, NULL); 7030} 7031 7032void md_stop_writes(struct mddev *mddev) 7033{ 7034 mddev_lock_nointr(mddev); 7035 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 7036 stop_sync_thread(mddev, true); 7037 __md_stop_writes(mddev); 7038 mddev_unlock(mddev); 7039} 7040EXPORT_SYMBOL_GPL(md_stop_writes); 7041 7042static void mddev_detach(struct mddev *mddev) 7043{ 7044 if (md_bitmap_enabled(mddev, false)) 7045 mddev->bitmap_ops->wait_behind_writes(mddev); 7046 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 7047 mddev->pers->quiesce(mddev, 1); 7048 mddev->pers->quiesce(mddev, 0); 7049 } 7050 md_unregister_thread(mddev, &mddev->thread); 7051 7052 /* the unplug fn references 'conf' */ 7053 if (!mddev_is_dm(mddev)) 7054 blk_sync_queue(mddev->gendisk->queue); 7055} 7056 7057static void __md_stop(struct mddev *mddev) 7058{ 7059 struct md_personality *pers = mddev->pers; 7060 7061 md_bitmap_destroy(mddev); 7062 mddev_detach(mddev); 7063 spin_lock(&mddev->lock); 7064 mddev->pers = NULL; 7065 spin_unlock(&mddev->lock); 7066 if (mddev->private) 7067 pers->free(mddev, mddev->private); 7068 mddev->private = NULL; 7069 put_pers(pers); 7070 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 7071} 7072 7073void md_stop(struct mddev *mddev) 7074{ 7075 lockdep_assert_held(&mddev->reconfig_mutex); 7076 7077 /* stop the array and free an attached data structures. 7078 * This is called from dm-raid 7079 */ 7080 __md_stop_writes(mddev); 7081 __md_stop(mddev); 7082} 7083 7084EXPORT_SYMBOL_GPL(md_stop); 7085 7086/* ensure 'mddev->pers' exist before calling md_set_readonly() */ 7087static int md_set_readonly(struct mddev *mddev) 7088{ 7089 int err = 0; 7090 int did_freeze = 0; 7091 7092 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 7093 return -EBUSY; 7094 7095 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 7096 did_freeze = 1; 7097 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 7098 } 7099 7100 stop_sync_thread(mddev, false); 7101 wait_event(mddev->sb_wait, 7102 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7103 mddev_lock_nointr(mddev); 7104 7105 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 7106 pr_warn("md: %s still in use.\n",mdname(mddev)); 7107 err = -EBUSY; 7108 goto out; 7109 } 7110 7111 __md_stop_writes(mddev); 7112 7113 if (mddev->ro == MD_RDONLY) { 7114 err = -ENXIO; 7115 goto out; 7116 } 7117 7118 mddev->ro = MD_RDONLY; 7119 set_disk_ro(mddev->gendisk, 1); 7120 7121out: 7122 if (!err || did_freeze) { 7123 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 7124 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7125 sysfs_notify_dirent_safe(mddev->sysfs_state); 7126 } 7127 7128 return err; 7129} 7130 7131/* mode: 7132 * 0 - completely stop and dis-assemble array 7133 * 2 - stop but do not disassemble array 7134 */ 7135static int do_md_stop(struct mddev *mddev, int mode) 7136{ 7137 struct gendisk *disk = mddev->gendisk; 7138 struct md_rdev *rdev; 7139 int did_freeze = 0; 7140 7141 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 7142 did_freeze = 1; 7143 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 7144 } 7145 7146 stop_sync_thread(mddev, true); 7147 7148 if (mddev->sysfs_active || 7149 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 7150 pr_warn("md: %s still in use.\n",mdname(mddev)); 7151 if (did_freeze) { 7152 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 7153 
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7154 } 7155 return -EBUSY; 7156 } 7157 if (mddev->pers) { 7158 if (!md_is_rdwr(mddev)) 7159 set_disk_ro(disk, 0); 7160 7161 if (mode == 2 && mddev->pers->sync_request && 7162 mddev->to_remove == NULL) 7163 mddev->to_remove = &md_redundancy_group; 7164 7165 __md_stop_writes(mddev); 7166 __md_stop(mddev); 7167 7168 /* tell userspace to handle 'inactive' */ 7169 sysfs_notify_dirent_safe(mddev->sysfs_state); 7170 7171 rdev_for_each(rdev, mddev) 7172 if (rdev->raid_disk >= 0) 7173 sysfs_unlink_rdev(mddev, rdev); 7174 7175 set_capacity_and_notify(disk, 0); 7176 mddev->changed = 1; 7177 7178 if (!md_is_rdwr(mddev)) 7179 mddev->ro = MD_RDWR; 7180 } 7181 /* 7182 * Free resources if final stop 7183 */ 7184 if (mode == 0) { 7185 pr_info("md: %s stopped.\n", mdname(mddev)); 7186 7187 if (mddev->bitmap_info.file) { 7188 struct file *f = mddev->bitmap_info.file; 7189 spin_lock(&mddev->lock); 7190 mddev->bitmap_info.file = NULL; 7191 spin_unlock(&mddev->lock); 7192 fput(f); 7193 } 7194 mddev->bitmap_info.offset = 0; 7195 7196 export_array(mddev); 7197 md_clean(mddev); 7198 if (!legacy_async_del_gendisk) 7199 set_bit(MD_DELETED, &mddev->flags); 7200 } 7201 md_new_event(); 7202 sysfs_notify_dirent_safe(mddev->sysfs_state); 7203 return 0; 7204} 7205 7206#ifndef MODULE 7207static void autorun_array(struct mddev *mddev) 7208{ 7209 struct md_rdev *rdev; 7210 int err; 7211 7212 if (list_empty(&mddev->disks)) 7213 return; 7214 7215 pr_info("md: running: "); 7216 7217 rdev_for_each(rdev, mddev) { 7218 pr_cont("<%pg>", rdev->bdev); 7219 } 7220 pr_cont("\n"); 7221 7222 err = do_md_run(mddev); 7223 if (err) { 7224 pr_warn("md: do_md_run() returned %d\n", err); 7225 do_md_stop(mddev, 0); 7226 } 7227} 7228 7229/* 7230 * lets try to run arrays based on all disks that have arrived 7231 * until now. (those are in pending_raid_disks) 7232 * 7233 * the method: pick the first pending disk, collect all disks with 7234 * the same UUID, remove all from the pending list and put them into 7235 * the 'same_array' list. Then order this list based on superblock 7236 * update time (freshest comes first), kick out 'old' disks and 7237 * compare superblocks. If everything's fine then run it. 7238 * 7239 * If "unit" is allocated, then bump its reference count 7240 */ 7241static void autorun_devices(int part) 7242{ 7243 struct md_rdev *rdev0, *rdev, *tmp; 7244 struct mddev *mddev; 7245 7246 pr_info("md: autorun ...\n"); 7247 while (!list_empty(&pending_raid_disks)) { 7248 int unit; 7249 dev_t dev; 7250 LIST_HEAD(candidates); 7251 rdev0 = list_entry(pending_raid_disks.next, 7252 struct md_rdev, same_set); 7253 7254 pr_debug("md: considering %pg ...\n", rdev0->bdev); 7255 INIT_LIST_HEAD(&candidates); 7256 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 7257 if (super_90_load(rdev, rdev0, 0) >= 0) { 7258 pr_debug("md: adding %pg ...\n", 7259 rdev->bdev); 7260 list_move(&rdev->same_set, &candidates); 7261 } 7262 /* 7263 * now we have a set of devices, with all of them having 7264 * mostly sane superblocks. It's time to allocate the 7265 * mddev. 
7266 */ 7267 if (part) { 7268 dev = MKDEV(mdp_major, 7269 rdev0->preferred_minor << MdpMinorShift); 7270 unit = MINOR(dev) >> MdpMinorShift; 7271 } else { 7272 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 7273 unit = MINOR(dev); 7274 } 7275 if (rdev0->preferred_minor != unit) { 7276 pr_warn("md: unit number in %pg is bad: %d\n", 7277 rdev0->bdev, rdev0->preferred_minor); 7278 break; 7279 } 7280 7281 mddev = md_alloc(dev, NULL); 7282 if (IS_ERR(mddev)) 7283 break; 7284 7285 if (mddev_suspend_and_lock(mddev)) 7286 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 7287 else if (mddev->raid_disks || mddev->major_version 7288 || !list_empty(&mddev->disks)) { 7289 pr_warn("md: %s already running, cannot run %pg\n", 7290 mdname(mddev), rdev0->bdev); 7291 mddev_unlock_and_resume(mddev); 7292 } else { 7293 pr_debug("md: created %s\n", mdname(mddev)); 7294 mddev->persistent = 1; 7295 rdev_for_each_list(rdev, tmp, &candidates) { 7296 list_del_init(&rdev->same_set); 7297 if (bind_rdev_to_array(rdev, mddev)) 7298 export_rdev(rdev); 7299 } 7300 autorun_array(mddev); 7301 mddev_unlock_and_resume(mddev); 7302 } 7303 /* on success, candidates will be empty, on error 7304 * it won't... 7305 */ 7306 rdev_for_each_list(rdev, tmp, &candidates) { 7307 list_del_init(&rdev->same_set); 7308 export_rdev(rdev); 7309 } 7310 mddev_put(mddev); 7311 } 7312 pr_info("md: ... autorun DONE.\n"); 7313} 7314#endif /* !MODULE */ 7315 7316static int get_version(void __user *arg) 7317{ 7318 mdu_version_t ver; 7319 7320 ver.major = MD_MAJOR_VERSION; 7321 ver.minor = MD_MINOR_VERSION; 7322 ver.patchlevel = MD_PATCHLEVEL_VERSION; 7323 7324 if (copy_to_user(arg, &ver, sizeof(ver))) 7325 return -EFAULT; 7326 7327 return 0; 7328} 7329 7330static int get_array_info(struct mddev *mddev, void __user *arg) 7331{ 7332 mdu_array_info_t info; 7333 int nr,working,insync,failed,spare; 7334 struct md_rdev *rdev; 7335 7336 nr = working = insync = failed = spare = 0; 7337 rcu_read_lock(); 7338 rdev_for_each_rcu(rdev, mddev) { 7339 nr++; 7340 if (test_bit(Faulty, &rdev->flags)) 7341 failed++; 7342 else { 7343 working++; 7344 if (test_bit(In_sync, &rdev->flags)) 7345 insync++; 7346 else if (test_bit(Journal, &rdev->flags)) 7347 /* TODO: add journal count to md_u.h */ 7348 ; 7349 else 7350 spare++; 7351 } 7352 } 7353 rcu_read_unlock(); 7354 7355 info.major_version = mddev->major_version; 7356 info.minor_version = mddev->minor_version; 7357 info.patch_version = MD_PATCHLEVEL_VERSION; 7358 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 7359 info.level = mddev->level; 7360 info.size = mddev->dev_sectors / 2; 7361 if (info.size != mddev->dev_sectors / 2) /* overflow */ 7362 info.size = -1; 7363 info.nr_disks = nr; 7364 info.raid_disks = mddev->raid_disks; 7365 info.md_minor = mddev->md_minor; 7366 info.not_persistent= !mddev->persistent; 7367 7368 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 7369 info.state = 0; 7370 if (mddev->in_sync) 7371 info.state = (1<<MD_SB_CLEAN); 7372 if (mddev->bitmap && mddev->bitmap_info.offset) 7373 info.state |= (1<<MD_SB_BITMAP_PRESENT); 7374 if (mddev_is_clustered(mddev)) 7375 info.state |= (1<<MD_SB_CLUSTERED); 7376 info.active_disks = insync; 7377 info.working_disks = working; 7378 info.failed_disks = failed; 7379 info.spare_disks = spare; 7380 7381 info.layout = mddev->layout; 7382 info.chunk_size = mddev->chunk_sectors << 9; 7383 7384 if (copy_to_user(arg, &info, sizeof(info))) 7385 return -EFAULT; 7386 7387 return 0; 7388} 7389 7390static int get_bitmap_file(struct mddev *mddev, void 
__user * arg) 7391{ 7392 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 7393 char *ptr; 7394 int err; 7395 7396 file = kzalloc_obj(*file, GFP_NOIO); 7397 if (!file) 7398 return -ENOMEM; 7399 7400 err = 0; 7401 spin_lock(&mddev->lock); 7402 /* bitmap enabled */ 7403 if (mddev->bitmap_info.file) { 7404 ptr = file_path(mddev->bitmap_info.file, file->pathname, 7405 sizeof(file->pathname)); 7406 if (IS_ERR(ptr)) 7407 err = PTR_ERR(ptr); 7408 else 7409 memmove(file->pathname, ptr, 7410 sizeof(file->pathname)-(ptr-file->pathname)); 7411 } 7412 spin_unlock(&mddev->lock); 7413 7414 if (err == 0 && 7415 copy_to_user(arg, file, sizeof(*file))) 7416 err = -EFAULT; 7417 7418 kfree(file); 7419 return err; 7420} 7421 7422static int get_disk_info(struct mddev *mddev, void __user * arg) 7423{ 7424 mdu_disk_info_t info; 7425 struct md_rdev *rdev; 7426 7427 if (copy_from_user(&info, arg, sizeof(info))) 7428 return -EFAULT; 7429 7430 rcu_read_lock(); 7431 rdev = md_find_rdev_nr_rcu(mddev, info.number); 7432 if (rdev) { 7433 info.major = MAJOR(rdev->bdev->bd_dev); 7434 info.minor = MINOR(rdev->bdev->bd_dev); 7435 info.raid_disk = rdev->raid_disk; 7436 info.state = 0; 7437 if (test_bit(Faulty, &rdev->flags)) 7438 info.state |= (1<<MD_DISK_FAULTY); 7439 else if (test_bit(In_sync, &rdev->flags)) { 7440 info.state |= (1<<MD_DISK_ACTIVE); 7441 info.state |= (1<<MD_DISK_SYNC); 7442 } 7443 if (test_bit(Journal, &rdev->flags)) 7444 info.state |= (1<<MD_DISK_JOURNAL); 7445 if (test_bit(WriteMostly, &rdev->flags)) 7446 info.state |= (1<<MD_DISK_WRITEMOSTLY); 7447 if (test_bit(FailFast, &rdev->flags)) 7448 info.state |= (1<<MD_DISK_FAILFAST); 7449 } else { 7450 info.major = info.minor = 0; 7451 info.raid_disk = -1; 7452 info.state = (1<<MD_DISK_REMOVED); 7453 } 7454 rcu_read_unlock(); 7455 7456 if (copy_to_user(arg, &info, sizeof(info))) 7457 return -EFAULT; 7458 7459 return 0; 7460} 7461 7462int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 7463{ 7464 struct md_rdev *rdev; 7465 dev_t dev = MKDEV(info->major,info->minor); 7466 7467 if (mddev_is_clustered(mddev) && 7468 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 7469 pr_warn("%s: Cannot add to clustered mddev.\n", 7470 mdname(mddev)); 7471 return -EINVAL; 7472 } 7473 7474 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 7475 return -EOVERFLOW; 7476 7477 if (!mddev->raid_disks) { 7478 int err; 7479 /* expecting a device which has a superblock */ 7480 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 7481 if (IS_ERR(rdev)) { 7482 pr_warn("md: md_import_device returned %ld\n", 7483 PTR_ERR(rdev)); 7484 return PTR_ERR(rdev); 7485 } 7486 if (!list_empty(&mddev->disks)) { 7487 struct md_rdev *rdev0 7488 = list_entry(mddev->disks.next, 7489 struct md_rdev, same_set); 7490 err = super_types[mddev->major_version] 7491 .load_super(rdev, rdev0, mddev->minor_version); 7492 if (err < 0) { 7493 pr_warn("md: %pg has different UUID to %pg\n", 7494 rdev->bdev, 7495 rdev0->bdev); 7496 export_rdev(rdev); 7497 return -EINVAL; 7498 } 7499 } 7500 err = bind_rdev_to_array(rdev, mddev); 7501 if (err) 7502 export_rdev(rdev); 7503 return err; 7504 } 7505 7506 /* 7507 * md_add_new_disk can be used once the array is assembled 7508 * to add "hot spares". 
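 * They are normally submitted by mdadm through the ADD_NEW_DISK ioctl
 * on the array's device node.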
They must already have a superblock 7509 * written 7510 */ 7511 if (mddev->pers) { 7512 int err; 7513 if (!mddev->pers->hot_add_disk) { 7514 pr_warn("%s: personality does not support diskops!\n", 7515 mdname(mddev)); 7516 return -EINVAL; 7517 } 7518 if (mddev->persistent) 7519 rdev = md_import_device(dev, mddev->major_version, 7520 mddev->minor_version); 7521 else 7522 rdev = md_import_device(dev, -1, -1); 7523 if (IS_ERR(rdev)) { 7524 pr_warn("md: md_import_device returned %ld\n", 7525 PTR_ERR(rdev)); 7526 return PTR_ERR(rdev); 7527 } 7528 /* set saved_raid_disk if appropriate */ 7529 if (!mddev->persistent) { 7530 if (info->state & (1<<MD_DISK_SYNC) && 7531 info->raid_disk < mddev->raid_disks) { 7532 rdev->raid_disk = info->raid_disk; 7533 clear_bit(Bitmap_sync, &rdev->flags); 7534 } else 7535 rdev->raid_disk = -1; 7536 rdev->saved_raid_disk = rdev->raid_disk; 7537 } else 7538 super_types[mddev->major_version]. 7539 validate_super(mddev, NULL/*freshest*/, rdev); 7540 if ((info->state & (1<<MD_DISK_SYNC)) && 7541 rdev->raid_disk != info->raid_disk) { 7542 /* This was a hot-add request, but events doesn't 7543 * match, so reject it. 7544 */ 7545 export_rdev(rdev); 7546 return -EINVAL; 7547 } 7548 7549 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 7550 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7551 set_bit(WriteMostly, &rdev->flags); 7552 else 7553 clear_bit(WriteMostly, &rdev->flags); 7554 if (info->state & (1<<MD_DISK_FAILFAST)) 7555 set_bit(FailFast, &rdev->flags); 7556 else 7557 clear_bit(FailFast, &rdev->flags); 7558 7559 if (info->state & (1<<MD_DISK_JOURNAL)) { 7560 struct md_rdev *rdev2; 7561 bool has_journal = false; 7562 7563 /* make sure no existing journal disk */ 7564 rdev_for_each(rdev2, mddev) { 7565 if (test_bit(Journal, &rdev2->flags)) { 7566 has_journal = true; 7567 break; 7568 } 7569 } 7570 if (has_journal || mddev->bitmap) { 7571 export_rdev(rdev); 7572 return -EBUSY; 7573 } 7574 set_bit(Journal, &rdev->flags); 7575 } 7576 /* 7577 * check whether the device shows up in other nodes 7578 */ 7579 if (mddev_is_clustered(mddev)) { 7580 if (info->state & (1 << MD_DISK_CANDIDATE)) 7581 set_bit(Candidate, &rdev->flags); 7582 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 7583 /* --add initiated by this node */ 7584 err = mddev->cluster_ops->add_new_disk(mddev, rdev); 7585 if (err) { 7586 export_rdev(rdev); 7587 return err; 7588 } 7589 } 7590 } 7591 7592 rdev->raid_disk = -1; 7593 err = bind_rdev_to_array(rdev, mddev); 7594 7595 if (err) 7596 export_rdev(rdev); 7597 7598 if (mddev_is_clustered(mddev)) { 7599 if (info->state & (1 << MD_DISK_CANDIDATE)) { 7600 if (!err) { 7601 err = mddev->cluster_ops->new_disk_ack( 7602 mddev, err == 0); 7603 if (err) 7604 md_kick_rdev_from_array(rdev); 7605 } 7606 } else { 7607 if (err) 7608 mddev->cluster_ops->add_new_disk_cancel(mddev); 7609 else 7610 err = add_bound_rdev(rdev); 7611 } 7612 7613 } else if (!err) 7614 err = add_bound_rdev(rdev); 7615 7616 return err; 7617 } 7618 7619 /* otherwise, md_add_new_disk is only allowed 7620 * for major_version==0 superblocks 7621 */ 7622 if (mddev->major_version != 0) { 7623 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 7624 return -EINVAL; 7625 } 7626 7627 if (!(info->state & (1<<MD_DISK_FAULTY))) { 7628 int err; 7629 rdev = md_import_device(dev, -1, 0); 7630 if (IS_ERR(rdev)) { 7631 pr_warn("md: error, md_import_device() returned %ld\n", 7632 PTR_ERR(rdev)); 7633 return PTR_ERR(rdev); 7634 } 7635 rdev->desc_nr = info->number; 7636 if (info->raid_disk < 
mddev->raid_disks) 7637 rdev->raid_disk = info->raid_disk; 7638 else 7639 rdev->raid_disk = -1; 7640 7641 if (rdev->raid_disk < mddev->raid_disks) 7642 if (info->state & (1<<MD_DISK_SYNC)) 7643 set_bit(In_sync, &rdev->flags); 7644 7645 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7646 set_bit(WriteMostly, &rdev->flags); 7647 if (info->state & (1<<MD_DISK_FAILFAST)) 7648 set_bit(FailFast, &rdev->flags); 7649 7650 if (!mddev->persistent) { 7651 pr_debug("md: nonpersistent superblock ...\n"); 7652 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7653 } else 7654 rdev->sb_start = calc_dev_sboffset(rdev); 7655 rdev->sectors = rdev->sb_start; 7656 7657 err = bind_rdev_to_array(rdev, mddev); 7658 if (err) { 7659 export_rdev(rdev); 7660 return err; 7661 } 7662 } 7663 7664 return 0; 7665} 7666 7667static int hot_remove_disk(struct mddev *mddev, dev_t dev) 7668{ 7669 struct md_rdev *rdev; 7670 7671 if (!mddev->pers) 7672 return -ENODEV; 7673 7674 rdev = find_rdev(mddev, dev); 7675 if (!rdev) 7676 return -ENXIO; 7677 7678 if (rdev->raid_disk < 0) 7679 goto kick_rdev; 7680 7681 clear_bit(Blocked, &rdev->flags); 7682 remove_and_add_spares(mddev, rdev); 7683 7684 if (rdev->raid_disk >= 0) 7685 goto busy; 7686 7687kick_rdev: 7688 if (mddev_is_clustered(mddev) && 7689 mddev->cluster_ops->remove_disk(mddev, rdev)) 7690 goto busy; 7691 7692 md_kick_rdev_from_array(rdev); 7693 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7694 if (!mddev->thread) 7695 md_update_sb(mddev, 1); 7696 md_new_event(); 7697 7698 return 0; 7699busy: 7700 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7701 rdev->bdev, mdname(mddev)); 7702 return -EBUSY; 7703} 7704 7705static int hot_add_disk(struct mddev *mddev, dev_t dev) 7706{ 7707 int err; 7708 struct md_rdev *rdev; 7709 7710 if (!mddev->pers) 7711 return -ENODEV; 7712 7713 if (mddev->major_version != 0) { 7714 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7715 mdname(mddev)); 7716 return -EINVAL; 7717 } 7718 if (!mddev->pers->hot_add_disk) { 7719 pr_warn("%s: personality does not support diskops!\n", 7720 mdname(mddev)); 7721 return -EINVAL; 7722 } 7723 7724 rdev = md_import_device(dev, -1, 0); 7725 if (IS_ERR(rdev)) { 7726 pr_warn("md: error, md_import_device() returned %ld\n", 7727 PTR_ERR(rdev)); 7728 return -EINVAL; 7729 } 7730 7731 if (mddev->persistent) 7732 rdev->sb_start = calc_dev_sboffset(rdev); 7733 else 7734 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7735 7736 rdev->sectors = rdev->sb_start; 7737 7738 if (test_bit(Faulty, &rdev->flags)) { 7739 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7740 rdev->bdev, mdname(mddev)); 7741 err = -EINVAL; 7742 goto abort_export; 7743 } 7744 7745 clear_bit(In_sync, &rdev->flags); 7746 rdev->desc_nr = -1; 7747 rdev->saved_raid_disk = -1; 7748 err = bind_rdev_to_array(rdev, mddev); 7749 if (err) 7750 goto abort_export; 7751 7752 /* 7753 * The rest should better be atomic, we can have disk failures 7754 * noticed in interrupt contexts ... 7755 */ 7756 7757 rdev->raid_disk = -1; 7758 7759 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7760 if (!mddev->thread) 7761 md_update_sb(mddev, 1); 7762 /* 7763 * Kick recovery, maybe this spare has to be added to the 7764 * array immediately. 
7765 */
7766 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7767 md_new_event();
7768 return 0;
7769
7770abort_export:
7771 export_rdev(rdev);
7772 return err;
7773}
7774
7775static int set_bitmap_file(struct mddev *mddev, int fd)
7776{
7777 int err = 0;
7778
7779 if (!md_bitmap_registered(mddev) ||
7780 mddev->bitmap_id == ID_BITMAP_NONE)
7781 return -EINVAL;
7782
7783 if (mddev->pers) {
7784 if (!mddev->pers->quiesce || !mddev->thread)
7785 return -EBUSY;
7786 if (mddev->recovery || mddev->sync_thread)
7787 return -EBUSY;
7788 /* we should be able to change the bitmap.. */
7789 }
7790
7791 if (fd >= 0) {
7792 struct inode *inode;
7793 struct file *f;
7794
7795 if (mddev->bitmap || mddev->bitmap_info.file)
7796 return -EEXIST; /* cannot add when bitmap is present */
7797
7798 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
7799 pr_warn("%s: bitmap files not supported by this kernel\n",
7800 mdname(mddev));
7801 return -EINVAL;
7802 }
7803 pr_warn("%s: using deprecated bitmap file support\n",
7804 mdname(mddev));
7805
7806 f = fget(fd);
7807
7808 if (f == NULL) {
7809 pr_warn("%s: error: failed to get bitmap file\n",
7810 mdname(mddev));
7811 return -EBADF;
7812 }
7813
7814 inode = f->f_mapping->host;
7815 if (!S_ISREG(inode->i_mode)) {
7816 pr_warn("%s: error: bitmap file must be a regular file\n",
7817 mdname(mddev));
7818 err = -EBADF;
7819 } else if (!(f->f_mode & FMODE_WRITE)) {
7820 pr_warn("%s: error: bitmap file must be opened for write\n",
7821 mdname(mddev));
7822 err = -EBADF;
7823 } else if (atomic_read(&inode->i_writecount) != 1) {
7824 pr_warn("%s: error: bitmap file is already in use\n",
7825 mdname(mddev));
7826 err = -EBUSY;
7827 }
7828 if (err) {
7829 fput(f);
7830 return err;
7831 }
7832 mddev->bitmap_info.file = f;
7833 mddev->bitmap_info.offset = 0; /* file overrides offset */
7834 } else if (mddev->bitmap == NULL)
7835 return -ENOENT; /* cannot remove what isn't there */
7836 err = 0;
7837 if (mddev->pers) {
7838 if (fd >= 0) {
7839 err = md_bitmap_create(mddev);
7840 if (!err)
7841 err = mddev->bitmap_ops->load(mddev);
7842
7843 if (err) {
7844 md_bitmap_destroy(mddev);
7845 md_bitmap_set_none(mddev);
7846 fd = -1;
7847 }
7848 } else if (fd < 0) {
7849 md_bitmap_destroy(mddev);
7850 md_bitmap_set_none(mddev);
7851 }
7852 }
7853
7854 if (fd < 0) {
7855 struct file *f = mddev->bitmap_info.file;
7856 if (f) {
7857 spin_lock(&mddev->lock);
7858 mddev->bitmap_info.file = NULL;
7859 spin_unlock(&mddev->lock);
7860 fput(f);
7861 }
7862 }
7863
7864 return err;
7865}
7866
7867/*
7868 * md_set_array_info is used in two different ways.
7869 * The original usage is when creating a new array.
7870 * In this usage, raid_disks is > 0 and it together with
7871 * level, size, not_persistent, layout and chunk_size determine the
7872 * shape of the array.
7873 * This will always create an array with a type-0.90.0 superblock.
7874 * The newer usage is when assembling an array.
7875 * In this case raid_disks will be 0, and the major_version field is
7876 * used to determine which style super-blocks are to be found on the devices.
7877 * The minor and patch _version numbers are also kept in case the
7878 * super_block handler wishes to interpret them.
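 *
 * Rough userspace sketches of both usages (illustrative only; md_fd and
 * all field values are assumptions, error handling is omitted):
 *
 *	mdu_array_info_t create = {
 *		.level = 1, .raid_disks = 2,
 *		.size = 1024 * 1024,		/* KiB per device */
 *	};
 *	ioctl(md_fd, SET_ARRAY_INFO, &create);	/* create, 0.90 superblock */
 *
 *	mdu_array_info_t assemble = { .major_version = 1 };
 *	ioctl(md_fd, SET_ARRAY_INFO, &assemble);	/* only pick sb version */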
7879 */ 7880int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) 7881{ 7882 if (info->raid_disks == 0) { 7883 /* just setting version number for superblock loading */ 7884 if (info->major_version < 0 || 7885 info->major_version >= ARRAY_SIZE(super_types) || 7886 super_types[info->major_version].name == NULL) { 7887 /* maybe try to auto-load a module? */ 7888 pr_warn("md: superblock version %d not known\n", 7889 info->major_version); 7890 return -EINVAL; 7891 } 7892 mddev->major_version = info->major_version; 7893 mddev->minor_version = info->minor_version; 7894 mddev->patch_version = info->patch_version; 7895 mddev->persistent = !info->not_persistent; 7896 /* ensure mddev_put doesn't delete this now that there 7897 * is some minimal configuration. 7898 */ 7899 mddev->ctime = ktime_get_real_seconds(); 7900 return 0; 7901 } 7902 mddev->major_version = MD_MAJOR_VERSION; 7903 mddev->minor_version = MD_MINOR_VERSION; 7904 mddev->patch_version = MD_PATCHLEVEL_VERSION; 7905 mddev->ctime = ktime_get_real_seconds(); 7906 7907 mddev->level = info->level; 7908 mddev->clevel[0] = 0; 7909 mddev->dev_sectors = 2 * (sector_t)info->size; 7910 mddev->raid_disks = info->raid_disks; 7911 /* don't set md_minor, it is determined by which /dev/md* was 7912 * openned 7913 */ 7914 if (info->state & (1<<MD_SB_CLEAN)) 7915 mddev->resync_offset = MaxSector; 7916 else 7917 mddev->resync_offset = 0; 7918 mddev->persistent = ! info->not_persistent; 7919 mddev->external = 0; 7920 7921 mddev->layout = info->layout; 7922 if (mddev->level == 0) 7923 /* Cannot trust RAID0 layout info here */ 7924 mddev->layout = -1; 7925 mddev->chunk_sectors = info->chunk_size >> 9; 7926 7927 if (mddev->persistent) { 7928 mddev->max_disks = MD_SB_DISKS; 7929 mddev->flags = 0; 7930 mddev->sb_flags = 0; 7931 } 7932 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7933 7934 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 7935 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 7936 mddev->bitmap_info.offset = 0; 7937 7938 mddev->reshape_position = MaxSector; 7939 7940 /* 7941 * Generate a 128 bit UUID 7942 */ 7943 get_random_bytes(mddev->uuid, 16); 7944 7945 mddev->new_level = mddev->level; 7946 mddev->new_chunk_sectors = mddev->chunk_sectors; 7947 mddev->new_layout = mddev->layout; 7948 mddev->delta_disks = 0; 7949 mddev->reshape_backwards = 0; 7950 7951 return 0; 7952} 7953 7954void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 7955{ 7956 lockdep_assert_held(&mddev->reconfig_mutex); 7957 7958 if (mddev->external_size) 7959 return; 7960 7961 mddev->array_sectors = array_sectors; 7962} 7963EXPORT_SYMBOL(md_set_array_sectors); 7964 7965static int update_size(struct mddev *mddev, sector_t num_sectors) 7966{ 7967 struct md_rdev *rdev; 7968 int rv; 7969 int fit = (num_sectors == 0); 7970 sector_t old_dev_sectors = mddev->dev_sectors; 7971 7972 if (mddev->pers->resize == NULL) 7973 return -EINVAL; 7974 /* The "num_sectors" is the number of sectors of each device that 7975 * is used. This can only make sense for arrays with redundancy. 7976 * linear and raid0 always use whatever space is available. We can only 7977 * consider changing this number if no resync or reconstruction is 7978 * happening, and if the new size is acceptable. It must fit before the 7979 * sb_start or, if that is <data_offset, it must fit before the size 7980 * of each device. If num_sectors is zero, we find the largest size 7981 * that fits. 
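 * For example, num_sectors == 0 on members of different sizes makes the
 * loop below settle on the smallest rdev->sectors, which is then passed
 * to pers->resize().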
7982 */ 7983 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7984 return -EBUSY; 7985 if (!md_is_rdwr(mddev)) 7986 return -EROFS; 7987 7988 rdev_for_each(rdev, mddev) { 7989 sector_t avail = rdev->sectors; 7990 7991 if (fit && (num_sectors == 0 || num_sectors > avail)) 7992 num_sectors = avail; 7993 if (avail < num_sectors) 7994 return -ENOSPC; 7995 } 7996 rv = mddev->pers->resize(mddev, num_sectors); 7997 if (!rv) { 7998 if (mddev_is_clustered(mddev)) 7999 mddev->cluster_ops->update_size(mddev, old_dev_sectors); 8000 else if (!mddev_is_dm(mddev)) 8001 set_capacity_and_notify(mddev->gendisk, 8002 mddev->array_sectors); 8003 } 8004 return rv; 8005} 8006 8007static int update_raid_disks(struct mddev *mddev, int raid_disks) 8008{ 8009 int rv; 8010 struct md_rdev *rdev; 8011 /* change the number of raid disks */ 8012 if (mddev->pers->check_reshape == NULL) 8013 return -EINVAL; 8014 if (!md_is_rdwr(mddev)) 8015 return -EROFS; 8016 if (raid_disks <= 0 || 8017 (mddev->max_disks && raid_disks >= mddev->max_disks)) 8018 return -EINVAL; 8019 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 8020 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 8021 mddev->reshape_position != MaxSector) 8022 return -EBUSY; 8023 8024 rdev_for_each(rdev, mddev) { 8025 if (mddev->raid_disks < raid_disks && 8026 rdev->data_offset < rdev->new_data_offset) 8027 return -EINVAL; 8028 if (mddev->raid_disks > raid_disks && 8029 rdev->data_offset > rdev->new_data_offset) 8030 return -EINVAL; 8031 } 8032 8033 mddev->delta_disks = raid_disks - mddev->raid_disks; 8034 if (mddev->delta_disks < 0) 8035 mddev->reshape_backwards = 1; 8036 else if (mddev->delta_disks > 0) 8037 mddev->reshape_backwards = 0; 8038 8039 rv = mddev->pers->check_reshape(mddev); 8040 if (rv < 0) { 8041 mddev->delta_disks = 0; 8042 mddev->reshape_backwards = 0; 8043 } 8044 return rv; 8045} 8046 8047static int get_cluster_ops(struct mddev *mddev) 8048{ 8049 xa_lock(&md_submodule); 8050 mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); 8051 if (mddev->cluster_ops && 8052 !try_module_get(mddev->cluster_ops->head.owner)) 8053 mddev->cluster_ops = NULL; 8054 xa_unlock(&md_submodule); 8055 8056 return mddev->cluster_ops == NULL ? -ENOENT : 0; 8057} 8058 8059static void put_cluster_ops(struct mddev *mddev) 8060{ 8061 if (!mddev->cluster_ops) 8062 return; 8063 8064 mddev->cluster_ops->leave(mddev); 8065 module_put(mddev->cluster_ops->head.owner); 8066 mddev->cluster_ops = NULL; 8067} 8068 8069/* 8070 * update_array_info is used to change the configuration of an 8071 * on-line array. 8072 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 8073 * fields in the info are checked against the array. 8074 * Any differences that cannot be handled will cause an error. 8075 * Normally, only one change can be managed at a time. 
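 * For example, one call may grow the component size, change raid_disks,
 * change the layout, or add/remove the internal bitmap, but a request
 * that combines more than one of these is rejected with -EINVAL.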
8076 */ 8077static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 8078{ 8079 int rv = 0; 8080 int cnt = 0; 8081 int state = 0; 8082 8083 /* calculate expected state,ignoring low bits */ 8084 if (mddev->bitmap && mddev->bitmap_info.offset) 8085 state |= (1 << MD_SB_BITMAP_PRESENT); 8086 8087 if (mddev->major_version != info->major_version || 8088 mddev->minor_version != info->minor_version || 8089/* mddev->patch_version != info->patch_version || */ 8090 mddev->ctime != info->ctime || 8091 mddev->level != info->level || 8092/* mddev->layout != info->layout || */ 8093 mddev->persistent != !info->not_persistent || 8094 mddev->chunk_sectors != info->chunk_size >> 9 || 8095 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 8096 ((state^info->state) & 0xfffffe00) 8097 ) 8098 return -EINVAL; 8099 /* Check there is only one change */ 8100 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 8101 cnt++; 8102 if (mddev->raid_disks != info->raid_disks) 8103 cnt++; 8104 if (mddev->layout != info->layout) 8105 cnt++; 8106 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 8107 cnt++; 8108 if (cnt == 0) 8109 return 0; 8110 if (cnt > 1) 8111 return -EINVAL; 8112 8113 if (mddev->layout != info->layout) { 8114 /* Change layout 8115 * we don't need to do anything at the md level, the 8116 * personality will take care of it all. 8117 */ 8118 if (mddev->pers->check_reshape == NULL) 8119 return -EINVAL; 8120 else { 8121 mddev->new_layout = info->layout; 8122 rv = mddev->pers->check_reshape(mddev); 8123 if (rv) 8124 mddev->new_layout = mddev->layout; 8125 return rv; 8126 } 8127 } 8128 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 8129 rv = update_size(mddev, (sector_t)info->size * 2); 8130 8131 if (mddev->raid_disks != info->raid_disks) 8132 rv = update_raid_disks(mddev, info->raid_disks); 8133 8134 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 8135 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 8136 rv = -EINVAL; 8137 goto err; 8138 } 8139 if (mddev->recovery || mddev->sync_thread) { 8140 rv = -EBUSY; 8141 goto err; 8142 } 8143 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 8144 /* add the bitmap */ 8145 if (mddev->bitmap) { 8146 rv = -EEXIST; 8147 goto err; 8148 } 8149 if (mddev->bitmap_info.default_offset == 0) { 8150 rv = -EINVAL; 8151 goto err; 8152 } 8153 mddev->bitmap_info.offset = 8154 mddev->bitmap_info.default_offset; 8155 mddev->bitmap_info.space = 8156 mddev->bitmap_info.default_space; 8157 mddev->bitmap_id = ID_BITMAP; 8158 rv = md_bitmap_create(mddev); 8159 if (!rv) 8160 rv = mddev->bitmap_ops->load(mddev); 8161 8162 if (rv) { 8163 md_bitmap_destroy(mddev); 8164 mddev->bitmap_info.offset = 0; 8165 md_bitmap_set_none(mddev); 8166 } 8167 } else { 8168 struct md_bitmap_stats stats; 8169 8170 rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 8171 if (rv) 8172 goto err; 8173 8174 if (stats.file) { 8175 rv = -EINVAL; 8176 goto err; 8177 } 8178 8179 if (mddev->bitmap_info.nodes) { 8180 /* hold PW on all the bitmap lock */ 8181 if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { 8182 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 8183 rv = -EPERM; 8184 mddev->cluster_ops->unlock_all_bitmaps(mddev); 8185 goto err; 8186 } 8187 8188 mddev->bitmap_info.nodes = 0; 8189 put_cluster_ops(mddev); 8190 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 8191 } 8192 md_bitmap_destroy(mddev); 8193 mddev->bitmap_info.offset = 0; 8194 md_bitmap_set_none(mddev); 8195 } 
8196 } 8197 md_update_sb(mddev, 1); 8198 return rv; 8199err: 8200 return rv; 8201} 8202 8203static int set_disk_faulty(struct mddev *mddev, dev_t dev) 8204{ 8205 struct md_rdev *rdev; 8206 int err = 0; 8207 8208 if (mddev->pers == NULL) 8209 return -ENODEV; 8210 8211 rcu_read_lock(); 8212 rdev = md_find_rdev_rcu(mddev, dev); 8213 if (!rdev) 8214 err = -ENODEV; 8215 else { 8216 md_error(mddev, rdev); 8217 if (test_bit(MD_BROKEN, &mddev->flags)) 8218 err = -EBUSY; 8219 } 8220 rcu_read_unlock(); 8221 return err; 8222} 8223 8224/* 8225 * We have a problem here : there is no easy way to give a CHS 8226 * virtual geometry. We currently pretend that we have a 2 heads 8227 * 4 sectors (with a BIG number of cylinders...). This drives 8228 * dosfs just mad... ;-) 8229 */ 8230static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) 8231{ 8232 struct mddev *mddev = disk->private_data; 8233 8234 geo->heads = 2; 8235 geo->sectors = 4; 8236 geo->cylinders = mddev->array_sectors / 8; 8237 return 0; 8238} 8239 8240static inline int md_ioctl_valid(unsigned int cmd) 8241{ 8242 switch (cmd) { 8243 case GET_ARRAY_INFO: 8244 case GET_DISK_INFO: 8245 case RAID_VERSION: 8246 return 0; 8247 case ADD_NEW_DISK: 8248 case GET_BITMAP_FILE: 8249 case HOT_ADD_DISK: 8250 case HOT_REMOVE_DISK: 8251 case RESTART_ARRAY_RW: 8252 case RUN_ARRAY: 8253 case SET_ARRAY_INFO: 8254 case SET_BITMAP_FILE: 8255 case SET_DISK_FAULTY: 8256 case STOP_ARRAY: 8257 case STOP_ARRAY_RO: 8258 case CLUSTERED_DISK_NACK: 8259 if (!capable(CAP_SYS_ADMIN)) 8260 return -EACCES; 8261 return 0; 8262 default: 8263 return -ENOTTY; 8264 } 8265} 8266 8267static bool md_ioctl_need_suspend(unsigned int cmd) 8268{ 8269 switch (cmd) { 8270 case ADD_NEW_DISK: 8271 case HOT_ADD_DISK: 8272 case HOT_REMOVE_DISK: 8273 case SET_BITMAP_FILE: 8274 case SET_ARRAY_INFO: 8275 return true; 8276 default: 8277 return false; 8278 } 8279} 8280 8281static int __md_set_array_info(struct mddev *mddev, void __user *argp) 8282{ 8283 mdu_array_info_t info; 8284 int err; 8285 8286 if (!argp) 8287 memset(&info, 0, sizeof(info)); 8288 else if (copy_from_user(&info, argp, sizeof(info))) 8289 return -EFAULT; 8290 8291 if (mddev->pers) { 8292 err = update_array_info(mddev, &info); 8293 if (err) 8294 pr_warn("md: couldn't update array info. %d\n", err); 8295 return err; 8296 } 8297 8298 if (!list_empty(&mddev->disks)) { 8299 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 8300 return -EBUSY; 8301 } 8302 8303 if (mddev->raid_disks) { 8304 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 8305 return -EBUSY; 8306 } 8307 8308 err = md_set_array_info(mddev, &info); 8309 if (err) 8310 pr_warn("md: couldn't set array info. 
%d\n", err); 8311 8312 return err; 8313} 8314 8315static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 8316 unsigned int cmd, unsigned long arg) 8317{ 8318 int err = 0; 8319 void __user *argp = (void __user *)arg; 8320 struct mddev *mddev = NULL; 8321 8322 err = md_ioctl_valid(cmd); 8323 if (err) 8324 return err; 8325 8326 /* 8327 * Commands dealing with the RAID driver but not any 8328 * particular array: 8329 */ 8330 if (cmd == RAID_VERSION) 8331 return get_version(argp); 8332 8333 /* 8334 * Commands creating/starting a new array: 8335 */ 8336 8337 mddev = bdev->bd_disk->private_data; 8338 8339 /* Some actions do not requires the mutex */ 8340 switch (cmd) { 8341 case GET_ARRAY_INFO: 8342 if (!mddev->raid_disks && !mddev->external) 8343 return -ENODEV; 8344 return get_array_info(mddev, argp); 8345 8346 case GET_DISK_INFO: 8347 if (!mddev->raid_disks && !mddev->external) 8348 return -ENODEV; 8349 return get_disk_info(mddev, argp); 8350 8351 case SET_DISK_FAULTY: 8352 return set_disk_faulty(mddev, new_decode_dev(arg)); 8353 8354 case GET_BITMAP_FILE: 8355 return get_bitmap_file(mddev, argp); 8356 } 8357 8358 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 8359 /* Need to flush page cache, and ensure no-one else opens 8360 * and writes 8361 */ 8362 err = mddev_set_closing_and_sync_blockdev(mddev, 1); 8363 if (err) 8364 return err; 8365 } 8366 8367 if (!md_is_rdwr(mddev)) 8368 flush_work(&mddev->sync_work); 8369 8370 err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : 8371 mddev_lock(mddev); 8372 if (err) { 8373 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 8374 err, cmd); 8375 goto out; 8376 } 8377 8378 if (cmd == SET_ARRAY_INFO) { 8379 err = __md_set_array_info(mddev, argp); 8380 goto unlock; 8381 } 8382 8383 /* 8384 * Commands querying/configuring an existing array: 8385 */ 8386 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 8387 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 8388 if ((!mddev->raid_disks && !mddev->external) 8389 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 8390 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 8391 && cmd != GET_BITMAP_FILE) { 8392 err = -ENODEV; 8393 goto unlock; 8394 } 8395 8396 /* 8397 * Commands even a read-only array can execute: 8398 */ 8399 switch (cmd) { 8400 case RESTART_ARRAY_RW: 8401 err = restart_array(mddev); 8402 goto unlock; 8403 8404 case STOP_ARRAY: 8405 err = do_md_stop(mddev, 0); 8406 goto unlock; 8407 8408 case STOP_ARRAY_RO: 8409 if (mddev->pers) 8410 err = md_set_readonly(mddev); 8411 goto unlock; 8412 8413 case HOT_REMOVE_DISK: 8414 err = hot_remove_disk(mddev, new_decode_dev(arg)); 8415 goto unlock; 8416 8417 case ADD_NEW_DISK: 8418 /* We can support ADD_NEW_DISK on read-only arrays 8419 * only if we are re-adding a preexisting device. 8420 * So require mddev->pers and MD_DISK_SYNC. 8421 */ 8422 if (mddev->pers) { 8423 mdu_disk_info_t info; 8424 if (copy_from_user(&info, argp, sizeof(info))) 8425 err = -EFAULT; 8426 else if (!(info.state & (1<<MD_DISK_SYNC))) 8427 /* Need to clear read-only for this */ 8428 break; 8429 else 8430 err = md_add_new_disk(mddev, &info); 8431 goto unlock; 8432 } 8433 break; 8434 } 8435 8436 /* 8437 * The remaining ioctls are changing the state of the 8438 * superblock, so we do not allow them on read-only arrays. 
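 *
 * (Editor's note: concretely, the commands handled past this point are
 * ADD_NEW_DISK, CLUSTERED_DISK_NACK, HOT_ADD_DISK, RUN_ARRAY and
 * SET_BITMAP_FILE, as listed in the switch below.)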
8439 */ 8440 if (!md_is_rdwr(mddev) && mddev->pers) { 8441 if (mddev->ro != MD_AUTO_READ) { 8442 err = -EROFS; 8443 goto unlock; 8444 } 8445 mddev->ro = MD_RDWR; 8446 sysfs_notify_dirent_safe(mddev->sysfs_state); 8447 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8448 /* mddev_unlock will wake thread */ 8449 /* If a device failed while we were read-only, we 8450 * need to make sure the metadata is updated now. 8451 */ 8452 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 8453 mddev_unlock(mddev); 8454 wait_event(mddev->sb_wait, 8455 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 8456 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8457 mddev_lock_nointr(mddev); 8458 } 8459 } 8460 8461 switch (cmd) { 8462 case ADD_NEW_DISK: 8463 { 8464 mdu_disk_info_t info; 8465 if (copy_from_user(&info, argp, sizeof(info))) 8466 err = -EFAULT; 8467 else 8468 err = md_add_new_disk(mddev, &info); 8469 goto unlock; 8470 } 8471 8472 case CLUSTERED_DISK_NACK: 8473 if (mddev_is_clustered(mddev)) 8474 mddev->cluster_ops->new_disk_ack(mddev, false); 8475 else 8476 err = -EINVAL; 8477 goto unlock; 8478 8479 case HOT_ADD_DISK: 8480 err = hot_add_disk(mddev, new_decode_dev(arg)); 8481 goto unlock; 8482 8483 case RUN_ARRAY: 8484 err = do_md_run(mddev); 8485 goto unlock; 8486 8487 case SET_BITMAP_FILE: 8488 err = set_bitmap_file(mddev, (int)arg); 8489 goto unlock; 8490 8491 default: 8492 err = -EINVAL; 8493 goto unlock; 8494 } 8495 8496unlock: 8497 if (mddev->hold_active == UNTIL_IOCTL && 8498 err != -EINVAL) 8499 mddev->hold_active = 0; 8500 8501 md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : 8502 mddev_unlock(mddev); 8503 8504out: 8505 if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) 8506 clear_bit(MD_CLOSING, &mddev->flags); 8507 return err; 8508} 8509#ifdef CONFIG_COMPAT 8510static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 8511 unsigned int cmd, unsigned long arg) 8512{ 8513 switch (cmd) { 8514 case HOT_REMOVE_DISK: 8515 case HOT_ADD_DISK: 8516 case SET_DISK_FAULTY: 8517 case SET_BITMAP_FILE: 8518 /* These take in integer arg, do not convert */ 8519 break; 8520 default: 8521 arg = (unsigned long)compat_ptr(arg); 8522 break; 8523 } 8524 8525 return md_ioctl(bdev, mode, cmd, arg); 8526} 8527#endif /* CONFIG_COMPAT */ 8528 8529static int md_set_read_only(struct block_device *bdev, bool ro) 8530{ 8531 struct mddev *mddev = bdev->bd_disk->private_data; 8532 int err; 8533 8534 err = mddev_lock(mddev); 8535 if (err) 8536 return err; 8537 8538 if (!mddev->raid_disks && !mddev->external) { 8539 err = -ENODEV; 8540 goto out_unlock; 8541 } 8542 8543 /* 8544 * Transitioning to read-auto need only happen for arrays that call 8545 * md_write_start and which are not ready for writes yet. 
8546 */
8547 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
8548 err = restart_array(mddev);
8549 if (err)
8550 goto out_unlock;
8551 mddev->ro = MD_AUTO_READ;
8552 }
8553
8554out_unlock:
8555 mddev_unlock(mddev);
8556 return err;
8557}
8558
8559static int md_open(struct gendisk *disk, blk_mode_t mode)
8560{
8561 struct mddev *mddev;
8562 int err;
8563
8564 spin_lock(&all_mddevs_lock);
8565 mddev = mddev_get(disk->private_data);
8566 spin_unlock(&all_mddevs_lock);
8567 if (!mddev)
8568 return -ENODEV;
8569
8570 err = mutex_lock_interruptible(&mddev->open_mutex);
8571 if (err)
8572 goto out;
8573
8574 err = -ENODEV;
8575 if (test_bit(MD_CLOSING, &mddev->flags))
8576 goto out_unlock;
8577
8578 atomic_inc(&mddev->openers);
8579 mutex_unlock(&mddev->open_mutex);
8580
8581 disk_check_media_change(disk);
8582 return 0;
8583
8584out_unlock:
8585 mutex_unlock(&mddev->open_mutex);
8586out:
8587 mddev_put(mddev);
8588 return err;
8589}
8590
8591static void md_release(struct gendisk *disk)
8592{
8593 struct mddev *mddev = disk->private_data;
8594
8595 BUG_ON(!mddev);
8596 atomic_dec(&mddev->openers);
8597 mddev_put(mddev);
8598}
8599
8600static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
8601{
8602 struct mddev *mddev = disk->private_data;
8603 unsigned int ret = 0;
8604
8605 if (mddev->changed)
8606 ret = DISK_EVENT_MEDIA_CHANGE;
8607 mddev->changed = 0;
8608 return ret;
8609}
8610
8611static void md_free_disk(struct gendisk *disk)
8612{
8613 struct mddev *mddev = disk->private_data;
8614
8615 mddev_free(mddev);
8616}
8617
8618const struct block_device_operations md_fops =
8619{
8620 .owner = THIS_MODULE,
8621 .submit_bio = md_submit_bio,
8622 .open = md_open,
8623 .release = md_release,
8624 .ioctl = md_ioctl,
8625#ifdef CONFIG_COMPAT
8626 .compat_ioctl = md_compat_ioctl,
8627#endif
8628 .getgeo = md_getgeo,
8629 .check_events = md_check_events,
8630 .set_read_only = md_set_read_only,
8631 .free_disk = md_free_disk,
8632};
8633
8634static int md_thread(void *arg)
8635{
8636 struct md_thread *thread = arg;
8637
8638 /*
8639 * md_thread is a 'system-thread', its priority should be very
8640 * high. We avoid resource deadlocks individually in each
8641 * raid personality. (RAID5 does preallocation) We also use RR and
8642 * the very same RT priority as kswapd, thus we will never get
8643 * into a priority inversion deadlock.
8644 *
8645 * we definitely have to have equal or higher priority than
8646 * bdflush, otherwise bdflush will deadlock if there are too
8647 * many dirty RAID5 blocks.
8648 */
8649
8650 allow_signal(SIGKILL);
8651 while (!kthread_should_stop()) {
8652
8653 /* We need to wait INTERRUPTIBLE so that
8654 * we don't add to the load-average.
8655 * That means we need to be sure no signals are 8656 * pending 8657 */ 8658 if (signal_pending(current)) 8659 flush_signals(current); 8660 8661 wait_event_interruptible_timeout 8662 (thread->wqueue, 8663 test_bit(THREAD_WAKEUP, &thread->flags) 8664 || kthread_should_stop() || kthread_should_park(), 8665 thread->timeout); 8666 8667 clear_bit(THREAD_WAKEUP, &thread->flags); 8668 if (kthread_should_park()) 8669 kthread_parkme(); 8670 if (!kthread_should_stop()) 8671 thread->run(thread); 8672 } 8673 8674 return 0; 8675} 8676 8677static void md_wakeup_thread_directly(struct md_thread __rcu **thread) 8678{ 8679 struct md_thread *t; 8680 8681 rcu_read_lock(); 8682 t = rcu_dereference(*thread); 8683 if (t) 8684 wake_up_process(t->tsk); 8685 rcu_read_unlock(); 8686} 8687 8688void __md_wakeup_thread(struct md_thread __rcu *thread) 8689{ 8690 struct md_thread *t; 8691 8692 t = rcu_dereference(thread); 8693 if (t) { 8694 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8695 set_bit(THREAD_WAKEUP, &t->flags); 8696 if (wq_has_sleeper(&t->wqueue)) 8697 wake_up(&t->wqueue); 8698 } 8699} 8700EXPORT_SYMBOL(__md_wakeup_thread); 8701 8702struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8703 struct mddev *mddev, const char *name) 8704{ 8705 struct md_thread *thread; 8706 8707 thread = kzalloc_obj(struct md_thread); 8708 if (!thread) 8709 return NULL; 8710 8711 init_waitqueue_head(&thread->wqueue); 8712 8713 thread->run = run; 8714 thread->mddev = mddev; 8715 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8716 thread->tsk = kthread_run(md_thread, thread, 8717 "%s_%s", 8718 mdname(thread->mddev), 8719 name); 8720 if (IS_ERR(thread->tsk)) { 8721 kfree(thread); 8722 return NULL; 8723 } 8724 return thread; 8725} 8726EXPORT_SYMBOL(md_register_thread); 8727 8728void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8729{ 8730 struct md_thread *thread = rcu_dereference_protected(*threadp, 8731 lockdep_is_held(&mddev->reconfig_mutex)); 8732 8733 if (!thread) 8734 return; 8735 8736 rcu_assign_pointer(*threadp, NULL); 8737 synchronize_rcu(); 8738 8739 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8740 kthread_stop(thread->tsk); 8741 kfree(thread); 8742} 8743EXPORT_SYMBOL(md_unregister_thread); 8744 8745void md_error(struct mddev *mddev, struct md_rdev *rdev) 8746{ 8747 if (!rdev || test_bit(Faulty, &rdev->flags)) 8748 return; 8749 8750 if (!mddev->pers || !mddev->pers->error_handler) 8751 return; 8752 mddev->pers->error_handler(mddev, rdev); 8753 8754 if (mddev->pers->head.id == ID_RAID0 || 8755 mddev->pers->head.id == ID_LINEAR) 8756 return; 8757 8758 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8759 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8760 sysfs_notify_dirent_safe(rdev->sysfs_state); 8761 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8762 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8763 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8764 md_wakeup_thread(mddev->thread); 8765 } 8766 if (mddev->event_work.func) 8767 queue_work(md_misc_wq, &mddev->event_work); 8768 md_new_event(); 8769} 8770EXPORT_SYMBOL(md_error); 8771 8772/* seq_file implementation /proc/mdstat */ 8773 8774static void status_unused(struct seq_file *seq) 8775{ 8776 int i = 0; 8777 struct md_rdev *rdev; 8778 8779 seq_printf(seq, "unused devices: "); 8780 8781 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8782 i++; 8783 seq_printf(seq, "%pg ", rdev->bdev); 8784 } 8785 if (!i) 8786 seq_printf(seq, "<none>"); 8787 8788 seq_printf(seq, "\n"); 
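	/*
	 * Editor's note (illustrative only): given the format strings above,
	 * the resulting /proc/mdstat line looks like "unused devices: <none>"
	 * or "unused devices: sdc1 sdd1 " when loose devices are pending.
	 */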
8789} 8790 8791static void status_personalities(struct seq_file *seq) 8792{ 8793 struct md_submodule_head *head; 8794 unsigned long i; 8795 8796 seq_puts(seq, "Personalities : "); 8797 8798 xa_lock(&md_submodule); 8799 xa_for_each(&md_submodule, i, head) 8800 if (head->type == MD_PERSONALITY) 8801 seq_printf(seq, "[%s] ", head->name); 8802 xa_unlock(&md_submodule); 8803 8804 seq_puts(seq, "\n"); 8805} 8806 8807static int status_resync(struct seq_file *seq, struct mddev *mddev) 8808{ 8809 sector_t max_sectors, resync, res; 8810 unsigned long dt, db = 0; 8811 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8812 int scale, recovery_active; 8813 unsigned int per_milli; 8814 8815 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8816 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8817 max_sectors = mddev->resync_max_sectors; 8818 else 8819 max_sectors = mddev->dev_sectors; 8820 8821 resync = mddev->curr_resync; 8822 if (resync < MD_RESYNC_ACTIVE) { 8823 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8824 /* Still cleaning up */ 8825 resync = max_sectors; 8826 } else if (resync > max_sectors) { 8827 resync = max_sectors; 8828 } else { 8829 res = atomic_read(&mddev->recovery_active); 8830 /* 8831 * Resync has started, but the subtraction has overflowed or 8832 * yielded one of the special values. Force it to active to 8833 * ensure the status reports an active resync. 8834 */ 8835 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8836 resync = MD_RESYNC_ACTIVE; 8837 else 8838 resync -= res; 8839 } 8840 8841 if (resync == MD_RESYNC_NONE) { 8842 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8843 struct md_rdev *rdev; 8844 8845 rdev_for_each(rdev, mddev) 8846 if (rdev->raid_disk >= 0 && 8847 !test_bit(Faulty, &rdev->flags) && 8848 rdev->recovery_offset != MaxSector && 8849 rdev->recovery_offset) { 8850 seq_printf(seq, "\trecover=REMOTE"); 8851 return 1; 8852 } 8853 if (mddev->reshape_position != MaxSector) 8854 seq_printf(seq, "\treshape=REMOTE"); 8855 else 8856 seq_printf(seq, "\tresync=REMOTE"); 8857 return 1; 8858 } 8859 if (mddev->resync_offset < MaxSector) { 8860 seq_printf(seq, "\tresync=PENDING"); 8861 return 1; 8862 } 8863 return 0; 8864 } 8865 if (resync < MD_RESYNC_ACTIVE) { 8866 seq_printf(seq, "\tresync=DELAYED"); 8867 return 1; 8868 } 8869 8870 WARN_ON(max_sectors == 0); 8871 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8872 * in a sector_t, and (max_sectors>>scale) will fit in a 8873 * u32, as those are the requirements for sector_div. 8874 * Thus 'scale' must be at least 10 8875 */ 8876 scale = 10; 8877 if (sizeof(sector_t) > sizeof(unsigned long)) { 8878 while ( max_sectors/2 > (1ULL<<(scale+32))) 8879 scale++; 8880 } 8881 res = (resync>>scale)*1000; 8882 sector_div(res, (u32)((max_sectors>>scale)+1)); 8883 8884 per_milli = res; 8885 { 8886 int i, x = per_milli/50, y = 20-x; 8887 seq_printf(seq, "["); 8888 for (i = 0; i < x; i++) 8889 seq_printf(seq, "="); 8890 seq_printf(seq, ">"); 8891 for (i = 0; i < y; i++) 8892 seq_printf(seq, "."); 8893 seq_printf(seq, "] "); 8894 } 8895 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8896 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8897 "reshape" : 8898 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8899 "check" : 8900 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8901 "resync" : "recovery"))), 8902 per_milli/10, per_milli % 10, 8903 (unsigned long long) resync/2, 8904 (unsigned long long) max_sectors/2); 8905 8906 /* 8907 * dt: time from mark until now 8908 * db: blocks written from mark until now 8909 * rt: remaining time 8910 * 8911 * rt is a sector_t, which is always 64bit now. We are keeping 8912 * the original algorithm, but it is not really necessary. 8913 * 8914 * Original algorithm: 8915 * So we divide before multiply in case it is 32bit and close 8916 * to the limit. 8917 * We scale the divisor (db) by 32 to avoid losing precision 8918 * near the end of resync when the number of remaining sectors 8919 * is close to 'db'. 8920 * We then divide rt by 32 after multiplying by db to compensate. 8921 * The '+1' avoids division by zero if db is very small. 8922 */ 8923 dt = ((jiffies - mddev->resync_mark) / HZ); 8924 if (!dt) dt++; 8925 8926 curr_mark_cnt = mddev->curr_mark_cnt; 8927 recovery_active = atomic_read(&mddev->recovery_active); 8928 resync_mark_cnt = mddev->resync_mark_cnt; 8929 8930 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8931 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8932 8933 rt = max_sectors - resync; /* number of remaining sectors */ 8934 rt = div64_u64(rt, db/32+1); 8935 rt *= dt; 8936 rt >>= 5; 8937 8938 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8939 ((unsigned long)rt % 60)/6); 8940 8941 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8942 return 1; 8943} 8944 8945static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8946 __acquires(&all_mddevs_lock) 8947{ 8948 seq->poll_event = atomic_read(&md_event_count); 8949 spin_lock(&all_mddevs_lock); 8950 8951 return seq_list_start_head(&all_mddevs, *pos); 8952} 8953 8954static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8955{ 8956 return seq_list_next(v, &all_mddevs, pos); 8957} 8958 8959static void md_seq_stop(struct seq_file *seq, void *v) 8960 __releases(&all_mddevs_lock) 8961{ 8962 spin_unlock(&all_mddevs_lock); 8963} 8964 8965static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) 8966{ 8967 struct md_bitmap_stats stats; 8968 unsigned long used_pages; 8969 unsigned long chunk_kb; 8970 int err; 8971 8972 if (!md_bitmap_enabled(mddev, false)) 8973 return; 8974 8975 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 8976 if (err) 8977 return; 8978 8979 chunk_kb = mddev->bitmap_info.chunksize >> 10; 8980 used_pages = stats.pages - stats.missing_pages; 8981 8982 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", 8983 used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), 8984 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, 8985 chunk_kb ? 
"KB" : "B"); 8986 8987 if (stats.file) { 8988 seq_puts(seq, ", file: "); 8989 seq_file_path(seq, stats.file, " \t\n"); 8990 } 8991 8992 seq_putc(seq, '\n'); 8993} 8994 8995static int md_seq_show(struct seq_file *seq, void *v) 8996{ 8997 struct mddev *mddev; 8998 sector_t sectors; 8999 struct md_rdev *rdev; 9000 9001 if (v == &all_mddevs) { 9002 status_personalities(seq); 9003 if (list_empty(&all_mddevs)) 9004 status_unused(seq); 9005 return 0; 9006 } 9007 9008 mddev = list_entry(v, struct mddev, all_mddevs); 9009 if (!mddev_get(mddev)) 9010 return 0; 9011 9012 spin_unlock(&all_mddevs_lock); 9013 9014 /* prevent bitmap to be freed after checking */ 9015 mutex_lock(&mddev->bitmap_info.mutex); 9016 9017 spin_lock(&mddev->lock); 9018 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 9019 seq_printf(seq, "%s : ", mdname(mddev)); 9020 if (mddev->pers) { 9021 if (test_bit(MD_BROKEN, &mddev->flags)) 9022 seq_printf(seq, "broken"); 9023 else 9024 seq_printf(seq, "active"); 9025 if (mddev->ro == MD_RDONLY) 9026 seq_printf(seq, " (read-only)"); 9027 if (mddev->ro == MD_AUTO_READ) 9028 seq_printf(seq, " (auto-read-only)"); 9029 seq_printf(seq, " %s", mddev->pers->head.name); 9030 } else { 9031 seq_printf(seq, "inactive"); 9032 } 9033 9034 sectors = 0; 9035 rcu_read_lock(); 9036 rdev_for_each_rcu(rdev, mddev) { 9037 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 9038 9039 if (test_bit(WriteMostly, &rdev->flags)) 9040 seq_printf(seq, "(W)"); 9041 if (test_bit(Journal, &rdev->flags)) 9042 seq_printf(seq, "(J)"); 9043 if (test_bit(Faulty, &rdev->flags)) { 9044 seq_printf(seq, "(F)"); 9045 continue; 9046 } 9047 if (rdev->raid_disk < 0) 9048 seq_printf(seq, "(S)"); /* spare */ 9049 if (test_bit(Replacement, &rdev->flags)) 9050 seq_printf(seq, "(R)"); 9051 sectors += rdev->sectors; 9052 } 9053 rcu_read_unlock(); 9054 9055 if (!list_empty(&mddev->disks)) { 9056 if (mddev->pers) 9057 seq_printf(seq, "\n %llu blocks", 9058 (unsigned long long) 9059 mddev->array_sectors / 2); 9060 else 9061 seq_printf(seq, "\n %llu blocks", 9062 (unsigned long long)sectors / 2); 9063 } 9064 if (mddev->persistent) { 9065 if (mddev->major_version != 0 || 9066 mddev->minor_version != 90) { 9067 seq_printf(seq," super %d.%d", 9068 mddev->major_version, 9069 mddev->minor_version); 9070 } 9071 } else if (mddev->external) 9072 seq_printf(seq, " super external:%s", 9073 mddev->metadata_type); 9074 else 9075 seq_printf(seq, " super non-persistent"); 9076 9077 if (mddev->pers) { 9078 mddev->pers->status(seq, mddev); 9079 seq_printf(seq, "\n "); 9080 if (mddev->pers->sync_request) { 9081 if (status_resync(seq, mddev)) 9082 seq_printf(seq, "\n "); 9083 } 9084 } else 9085 seq_printf(seq, "\n "); 9086 9087 md_bitmap_status(seq, mddev); 9088 9089 seq_printf(seq, "\n"); 9090 } 9091 spin_unlock(&mddev->lock); 9092 mutex_unlock(&mddev->bitmap_info.mutex); 9093 spin_lock(&all_mddevs_lock); 9094 9095 if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) 9096 status_unused(seq); 9097 9098 mddev_put_locked(mddev); 9099 return 0; 9100} 9101 9102static const struct seq_operations md_seq_ops = { 9103 .start = md_seq_start, 9104 .next = md_seq_next, 9105 .stop = md_seq_stop, 9106 .show = md_seq_show, 9107}; 9108 9109static int md_seq_open(struct inode *inode, struct file *file) 9110{ 9111 struct seq_file *seq; 9112 int error; 9113 9114 error = seq_open(file, &md_seq_ops); 9115 if (error) 9116 return error; 9117 9118 seq = file->private_data; 9119 seq->poll_event = atomic_read(&md_event_count); 9120 return 
error; 9121}
9122
9123static int md_unloading;
9124static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
9125{
9126 struct seq_file *seq = filp->private_data;
9127 __poll_t mask;
9128
9129 if (md_unloading)
9130 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
9131 poll_wait(filp, &md_event_waiters, wait);
9132
9133 /* always allow read */
9134 mask = EPOLLIN | EPOLLRDNORM;
9135
9136 if (seq->poll_event != atomic_read(&md_event_count))
9137 mask |= EPOLLERR | EPOLLPRI;
9138 return mask;
9139}
9140
9141static const struct proc_ops mdstat_proc_ops = {
9142 .proc_open = md_seq_open,
9143 .proc_read = seq_read,
9144 .proc_lseek = seq_lseek,
9145 .proc_release = seq_release,
9146 .proc_poll = mdstat_poll,
9147};
9148
9149int register_md_submodule(struct md_submodule_head *msh)
9150{
9151 return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL);
9152}
9153EXPORT_SYMBOL_GPL(register_md_submodule);
9154
9155void unregister_md_submodule(struct md_submodule_head *msh)
9156{
9157 xa_erase(&md_submodule, msh->id);
9158}
9159EXPORT_SYMBOL_GPL(unregister_md_submodule);
9160
9161int md_setup_cluster(struct mddev *mddev, int nodes)
9162{
9163 int ret = get_cluster_ops(mddev);
9164
9165 if (ret) {
9166 request_module("md-cluster");
9167 ret = get_cluster_ops(mddev);
9168 }
9169
9170 /* ensure module won't be unloaded */
9171 if (ret) {
9172 pr_warn("can't find md-cluster module or get its reference.\n");
9173 return ret;
9174 }
9175
9176 ret = mddev->cluster_ops->join(mddev, nodes);
9177 if (!ret)
9178 mddev->safemode_delay = 0;
9179 return ret;
9180}
9181
9182void md_cluster_stop(struct mddev *mddev)
9183{
9184 put_cluster_ops(mddev);
9185}
9186
9187static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
9188{
9189 unsigned long last_events = rdev->last_events;
9190
9191 if (!bdev_is_partition(rdev->bdev))
9192 return true;
9193
9194 /*
9195 * If the rdev is a partition and the user doesn't issue IO to the
9196 * array, the array is still not idle if the user issues IO to other
9197 * partitions.
9198 */
9198 rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
9199 sectors) -
9200 part_stat_read_accum(rdev->bdev, sectors);
9201
9202 return init || rdev->last_events <= last_events;
9203}
9204
9205/*
9206 * mddev is idle if the following conditions are met since the last check:
9207 * 1) mddev doesn't have normal IO completed;
9208 * 2) mddev doesn't have inflight normal IO;
9209 * 3) if any member disk is a partition, the other partitions don't have IO
9210 * completed;
9211 *
9212 * Note that this check relies on IO accounting being enabled.
9213 */
9214static bool is_mddev_idle(struct mddev *mddev, int init)
9215{
9216 unsigned long last_events = mddev->normal_io_events;
9217 struct gendisk *disk;
9218 struct md_rdev *rdev;
9219 bool idle = true;
9220
9221 disk = mddev_is_dm(mddev) ?
mddev->dm_gendisk : mddev->gendisk; 9222 if (!disk) 9223 return true; 9224 9225 mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); 9226 if (!init && (mddev->normal_io_events > last_events || 9227 bdev_count_inflight(disk->part0))) 9228 idle = false; 9229 9230 rcu_read_lock(); 9231 rdev_for_each_rcu(rdev, mddev) 9232 if (!is_rdev_holder_idle(rdev, init)) 9233 idle = false; 9234 rcu_read_unlock(); 9235 9236 return idle; 9237} 9238 9239void md_done_sync(struct mddev *mddev, int blocks) 9240{ 9241 /* another "blocks" (512byte) blocks have been synced */ 9242 atomic_sub(blocks, &mddev->recovery_active); 9243 wake_up(&mddev->recovery_wait); 9244} 9245EXPORT_SYMBOL(md_done_sync); 9246 9247void md_sync_error(struct mddev *mddev) 9248{ 9249 // stop recovery, signal do_sync .... 9250 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9251 md_wakeup_thread(mddev->thread); 9252} 9253EXPORT_SYMBOL(md_sync_error); 9254 9255/* md_write_start(mddev, bi) 9256 * If we need to update some array metadata (e.g. 'active' flag 9257 * in superblock) before writing, schedule a superblock update 9258 * and wait for it to complete. 9259 * A return value of 'false' means that the write wasn't recorded 9260 * and cannot proceed as the array is being suspend. 9261 */ 9262void md_write_start(struct mddev *mddev, struct bio *bi) 9263{ 9264 int did_change = 0; 9265 9266 if (bio_data_dir(bi) != WRITE) 9267 return; 9268 9269 BUG_ON(mddev->ro == MD_RDONLY); 9270 if (mddev->ro == MD_AUTO_READ) { 9271 /* need to switch to read/write */ 9272 mddev->ro = MD_RDWR; 9273 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9274 md_wakeup_thread(mddev->thread); 9275 md_wakeup_thread(mddev->sync_thread); 9276 did_change = 1; 9277 } 9278 rcu_read_lock(); 9279 percpu_ref_get(&mddev->writes_pending); 9280 smp_mb(); /* Match smp_mb in set_in_sync() */ 9281 if (mddev->safemode == 1) 9282 mddev->safemode = 0; 9283 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 9284 if (mddev->in_sync || mddev->sync_checkers) { 9285 spin_lock(&mddev->lock); 9286 if (mddev->in_sync) { 9287 mddev->in_sync = 0; 9288 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9289 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9290 md_wakeup_thread(mddev->thread); 9291 did_change = 1; 9292 } 9293 spin_unlock(&mddev->lock); 9294 } 9295 rcu_read_unlock(); 9296 if (did_change) 9297 sysfs_notify_dirent_safe(mddev->sysfs_state); 9298 if (!test_bit(MD_HAS_SUPERBLOCK, &mddev->flags)) 9299 return; 9300 wait_event(mddev->sb_wait, 9301 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 9302} 9303EXPORT_SYMBOL(md_write_start); 9304 9305/* md_write_inc can only be called when md_write_start() has 9306 * already been called at least once of the current request. 9307 * It increments the counter and is useful when a single request 9308 * is split into several parts. Each part causes an increment and 9309 * so needs a matching md_write_end(). 9310 * Unlike md_write_start(), it is safe to call md_write_inc() inside 9311 * a spinlocked region. 
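 *
 * A rough call pattern (editor's illustration, not lifted from a particular
 * personality):
 *
 *	md_write_start(mddev, bio);	// covers the first part
 *	md_write_inc(mddev, bio);	// once per additional split part
 *	...
 *	md_write_end(mddev);		// once per part, as each completes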
9312 */ 9313void md_write_inc(struct mddev *mddev, struct bio *bi) 9314{ 9315 if (bio_data_dir(bi) != WRITE) 9316 return; 9317 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 9318 percpu_ref_get(&mddev->writes_pending); 9319} 9320EXPORT_SYMBOL(md_write_inc); 9321 9322void md_write_end(struct mddev *mddev) 9323{ 9324 percpu_ref_put(&mddev->writes_pending); 9325 9326 if (mddev->safemode == 2) 9327 md_wakeup_thread(mddev->thread); 9328 else if (mddev->safemode_delay) 9329 /* The roundup() ensures this only performs locking once 9330 * every ->safemode_delay jiffies 9331 */ 9332 mod_timer(&mddev->safemode_timer, 9333 roundup(jiffies, mddev->safemode_delay) + 9334 mddev->safemode_delay); 9335} 9336 9337EXPORT_SYMBOL(md_write_end); 9338 9339/* This is used by raid0 and raid10 */ 9340void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 9341 struct bio *bio, sector_t start, sector_t size) 9342{ 9343 struct bio *discard_bio = NULL; 9344 9345 __blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, &discard_bio); 9346 if (!discard_bio) 9347 return; 9348 9349 bio_chain(discard_bio, bio); 9350 bio_clone_blkg_association(discard_bio, bio); 9351 mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); 9352 submit_bio_noacct(discard_bio); 9353} 9354EXPORT_SYMBOL_GPL(md_submit_discard_bio); 9355 9356static void md_bitmap_start(struct mddev *mddev, 9357 struct md_io_clone *md_io_clone) 9358{ 9359 md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? 9360 mddev->bitmap_ops->start_discard : 9361 mddev->bitmap_ops->start_write; 9362 9363 if (mddev->pers->bitmap_sector) 9364 mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, 9365 &md_io_clone->sectors); 9366 9367 fn(mddev, md_io_clone->offset, md_io_clone->sectors); 9368} 9369 9370static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) 9371{ 9372 md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? 
9373 mddev->bitmap_ops->end_discard : 9374 mddev->bitmap_ops->end_write; 9375 9376 fn(mddev, md_io_clone->offset, md_io_clone->sectors); 9377} 9378 9379static void md_end_clone_io(struct bio *bio) 9380{ 9381 struct md_io_clone *md_io_clone = container_of(bio, struct md_io_clone, 9382 bio_clone); 9383 struct bio *orig_bio = md_io_clone->orig_bio; 9384 struct mddev *mddev = md_io_clone->mddev; 9385 struct completion *reshape_completion = bio->bi_private; 9386 9387 if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) 9388 md_bitmap_end(mddev, md_io_clone); 9389 9390 if (bio->bi_status && !orig_bio->bi_status) 9391 orig_bio->bi_status = bio->bi_status; 9392 9393 if (md_io_clone->start_time) 9394 bio_end_io_acct(orig_bio, md_io_clone->start_time); 9395 9396 bio_put(bio); 9397 if (unlikely(reshape_completion)) 9398 complete(reshape_completion); 9399 else 9400 bio_endio(orig_bio); 9401 percpu_ref_put(&mddev->active_io); 9402} 9403 9404static void md_clone_bio(struct mddev *mddev, struct bio **bio) 9405{ 9406 struct block_device *bdev = (*bio)->bi_bdev; 9407 struct md_io_clone *md_io_clone; 9408 struct bio *clone = 9409 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 9410 9411 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 9412 md_io_clone->orig_bio = *bio; 9413 md_io_clone->mddev = mddev; 9414 if (blk_queue_io_stat(bdev->bd_disk->queue)) 9415 md_io_clone->start_time = bio_start_io_acct(*bio); 9416 9417 if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) { 9418 md_io_clone->offset = (*bio)->bi_iter.bi_sector; 9419 md_io_clone->sectors = bio_sectors(*bio); 9420 md_io_clone->rw = op_stat_group(bio_op(*bio)); 9421 md_bitmap_start(mddev, md_io_clone); 9422 } 9423 9424 clone->bi_end_io = md_end_clone_io; 9425 clone->bi_private = NULL; 9426 *bio = clone; 9427} 9428 9429void md_account_bio(struct mddev *mddev, struct bio **bio) 9430{ 9431 percpu_ref_get(&mddev->active_io); 9432 md_clone_bio(mddev, bio); 9433} 9434EXPORT_SYMBOL_GPL(md_account_bio); 9435 9436/* md_allow_write(mddev) 9437 * Calling this ensures that the array is marked 'active' so that writes 9438 * may proceed without blocking. It is important to call this before 9439 * attempting a GFP_KERNEL allocation while holding the mddev lock. 9440 * Must be called with mddev_lock held. 
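 *
 * Typical call site (editor's sketch; names other than the md_* helpers are
 * placeholders):
 *
 *	mddev_lock(mddev);
 *	md_allow_write(mddev);
 *	buf = kmalloc(len, GFP_KERNEL);	// allocation may now block safely
 *	...
 *	mddev_unlock(mddev);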
9441 */
9442void md_allow_write(struct mddev *mddev)
9443{
9444 if (!mddev->pers)
9445 return;
9446 if (!md_is_rdwr(mddev))
9447 return;
9448 if (!mddev->pers->sync_request)
9449 return;
9450
9451 spin_lock(&mddev->lock);
9452 if (mddev->in_sync) {
9453 mddev->in_sync = 0;
9454 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
9455 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9456 if (mddev->safemode_delay &&
9457 mddev->safemode == 0)
9458 mddev->safemode = 1;
9459 spin_unlock(&mddev->lock);
9460 md_update_sb(mddev, 0);
9461 sysfs_notify_dirent_safe(mddev->sysfs_state);
9462 /* wait for the dirty state to be recorded in the metadata */
9463 wait_event(mddev->sb_wait,
9464 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
9465 } else
9466 spin_unlock(&mddev->lock);
9467}
9468EXPORT_SYMBOL_GPL(md_allow_write);
9469
9470static sector_t md_sync_max_sectors(struct mddev *mddev,
9471 enum sync_action action)
9472{
9473 switch (action) {
9474 case ACTION_RESYNC:
9475 case ACTION_CHECK:
9476 case ACTION_REPAIR:
9477 atomic64_set(&mddev->resync_mismatches, 0);
9478 fallthrough;
9479 case ACTION_RESHAPE:
9480 return mddev->resync_max_sectors;
9481 case ACTION_RECOVER:
9482 return mddev->dev_sectors;
9483 default:
9484 return 0;
9485 }
9486}
9487
9488/*
9489 * If lazy recovery is requested and all rdevs are in sync, select the rdev with
9490 * the highest index to perform recovery to build the initial xor data; this is
9491 * the same as with the old bitmap.
9492 */
9493static bool mddev_select_lazy_recover_rdev(struct mddev *mddev)
9494{
9495 struct md_rdev *recover_rdev = NULL;
9496 struct md_rdev *rdev;
9497 bool ret = false;
9498
9499 rcu_read_lock();
9500 rdev_for_each_rcu(rdev, mddev) {
9501 if (rdev->raid_disk < 0)
9502 continue;
9503
9504 if (test_bit(Faulty, &rdev->flags) ||
9505 !test_bit(In_sync, &rdev->flags))
9506 break;
9507
9508 if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk)
9509 recover_rdev = rdev;
9510 }
9511
9512 if (recover_rdev) {
9513 clear_bit(In_sync, &recover_rdev->flags);
9514 ret = true;
9515 }
9516
9517 rcu_read_unlock();
9518 return ret;
9519}
9520
9521static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
9522{
9523 sector_t start = 0;
9524 struct md_rdev *rdev;
9525
9526 switch (action) {
9527 case ACTION_CHECK:
9528 case ACTION_REPAIR:
9529 return mddev->resync_min;
9530 case ACTION_RESYNC:
9531 if (!mddev->bitmap)
9532 return mddev->resync_offset;
9533 return 0;
9534 case ACTION_RESHAPE:
9535 /*
9536 * If the original node aborts reshaping then we continue the
9537 * reshaping, so set it again to avoid restarting the reshape
9538 * from the very beginning
9539 */
9540 if (mddev_is_clustered(mddev) &&
9541 mddev->reshape_position != MaxSector)
9542 return mddev->reshape_position;
9543 return 0;
9544 case ACTION_RECOVER:
9545 start = MaxSector;
9546 rcu_read_lock();
9547 rdev_for_each_rcu(rdev, mddev)
9548 if (rdev_needs_recovery(rdev, start))
9549 start = rdev->recovery_offset;
9550 rcu_read_unlock();
9551
9552 /*
9553 * If there are no spares and raid456 lazy initial recovery is
9554 * requested.
9555 */
9556 if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) &&
9557 start == MaxSector && mddev_select_lazy_recover_rdev(mddev))
9558 start = 0;
9559
9560 /* If there is a bitmap, we need to make sure all
9561 * writes that started before we added a spare
9562 * complete before we start doing a recovery.
9563 * Otherwise the write might complete and (via
9564 * bitmap_endwrite) set a bit in the bitmap after the
9565 * recovery has checked that bit and skipped that
9566 * region.
9567 */
9568 if (mddev->bitmap) {
9569 mddev->pers->quiesce(mddev, 1);
9570 mddev->pers->quiesce(mddev, 0);
9571 }
9572 return start;
9573 default:
9574 return MaxSector;
9575 }
9576}
9577
9578static bool sync_io_within_limit(struct mddev *mddev)
9579{
9580 /*
9581 * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
9582 * RESYNC_PAGES(64k) per IO.
9583 */
9584 return atomic_read(&mddev->recovery_active) <
9585 (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev);
9586}
9587
9588/*
9589 * Update sync offset and mddev status when sync completes
9590 */
9591static void md_finish_sync(struct mddev *mddev, enum sync_action action)
9592{
9593 struct md_rdev *rdev;
9594
9595 switch (action) {
9596 case ACTION_RESYNC:
9597 case ACTION_REPAIR:
9598 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9599 mddev->curr_resync = MaxSector;
9600 mddev->resync_offset = mddev->curr_resync;
9601 break;
9602 case ACTION_RECOVER:
9603 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9604 mddev->curr_resync = MaxSector;
9605 rcu_read_lock();
9606 rdev_for_each_rcu(rdev, mddev)
9607 if (mddev->delta_disks >= 0 &&
9608 rdev_needs_recovery(rdev, mddev->curr_resync))
9609 rdev->recovery_offset = mddev->curr_resync;
9610 rcu_read_unlock();
9611 break;
9612 case ACTION_RESHAPE:
9613 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9614 mddev->delta_disks > 0 &&
9615 mddev->pers->finish_reshape &&
9616 mddev->pers->size &&
9617 !mddev_is_dm(mddev)) {
9618 mddev_lock_nointr(mddev);
9619 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
9620 mddev_unlock(mddev);
9621 if (!mddev_is_clustered(mddev))
9622 set_capacity_and_notify(mddev->gendisk,
9623 mddev->array_sectors);
9624 }
9625 if (mddev->pers->finish_reshape)
9626 mddev->pers->finish_reshape(mddev);
9627 break;
9628 /* */
9629 case ACTION_CHECK:
9630 default:
9631 break;
9632 }
9633}
9634
9635#define SYNC_MARKS 10
9636#define SYNC_MARK_STEP (3*HZ)
9637#define UPDATE_FREQUENCY (5*60*HZ)
9638void md_do_sync(struct md_thread *thread)
9639{
9640 struct mddev *mddev = thread->mddev;
9641 struct mddev *mddev2;
9642 unsigned int currspeed = 0, window;
9643 sector_t max_sectors,j, io_sectors, recovery_done;
9644 unsigned long mark[SYNC_MARKS];
9645 unsigned long update_time;
9646 sector_t mark_cnt[SYNC_MARKS];
9647 int last_mark,m;
9648 sector_t last_check;
9649 int skipped = 0;
9650 enum sync_action action;
9651 const char *desc;
9652 struct blk_plug plug;
9653 int ret;
9654
9655 /* just in case thread restarts...
*/ 9656 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 9657 return; 9658 9659 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9660 goto skip; 9661 9662 if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || 9663 !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 9664 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9665 goto skip; 9666 } 9667 9668 if (mddev_is_clustered(mddev)) { 9669 ret = mddev->cluster_ops->resync_start(mddev); 9670 if (ret) 9671 goto skip; 9672 9673 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 9674 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 9675 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 9676 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 9677 && ((unsigned long long)mddev->curr_resync_completed 9678 < (unsigned long long)mddev->resync_max_sectors)) 9679 goto skip; 9680 } 9681 9682 action = md_sync_action(mddev); 9683 if (action == ACTION_FROZEN || action == ACTION_IDLE) { 9684 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9685 goto skip; 9686 } 9687 9688 desc = md_sync_action_name(action); 9689 mddev->last_sync_action = action; 9690 9691 /* 9692 * Before starting a resync we must have set curr_resync to 9693 * 2, and then checked that every "conflicting" array has curr_resync 9694 * less than ours. When we find one that is the same or higher 9695 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 9696 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 9697 * This will mean we have to start checking from the beginning again. 9698 * 9699 */ 9700 if (mddev_is_clustered(mddev)) 9701 mddev->cluster_ops->resync_start_notify(mddev); 9702 do { 9703 int mddev2_minor = -1; 9704 mddev->curr_resync = MD_RESYNC_DELAYED; 9705 9706 try_again: 9707 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9708 goto skip; 9709 spin_lock(&all_mddevs_lock); 9710 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 9711 if (test_bit(MD_DELETED, &mddev2->flags)) 9712 continue; 9713 if (mddev2 == mddev) 9714 continue; 9715 if (!mddev->parallel_resync 9716 && mddev2->curr_resync 9717 && match_mddev_units(mddev, mddev2)) { 9718 DEFINE_WAIT(wq); 9719 if (mddev < mddev2 && 9720 mddev->curr_resync == MD_RESYNC_DELAYED) { 9721 /* arbitrarily yield */ 9722 mddev->curr_resync = MD_RESYNC_YIELDED; 9723 wake_up(&resync_wait); 9724 } 9725 if (mddev > mddev2 && 9726 mddev->curr_resync == MD_RESYNC_YIELDED) 9727 /* no need to wait here, we can wait the next 9728 * time 'round when curr_resync == 2 9729 */ 9730 continue; 9731 /* We need to wait 'interruptible' so as not to 9732 * contribute to the load average, and not to 9733 * be caught by 'softlockup' 9734 */ 9735 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 9736 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9737 mddev2->curr_resync >= mddev->curr_resync) { 9738 if (mddev2_minor != mddev2->md_minor) { 9739 mddev2_minor = mddev2->md_minor; 9740 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 9741 desc, mdname(mddev), 9742 mdname(mddev2)); 9743 } 9744 spin_unlock(&all_mddevs_lock); 9745 9746 if (signal_pending(current)) 9747 flush_signals(current); 9748 schedule(); 9749 finish_wait(&resync_wait, &wq); 9750 goto try_again; 9751 } 9752 finish_wait(&resync_wait, &wq); 9753 } 9754 } 9755 spin_unlock(&all_mddevs_lock); 9756 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 9757 9758 max_sectors = md_sync_max_sectors(mddev, action); 9759 j = md_sync_position(mddev, action); 9760 9761 pr_info("md: %s of RAID array %s\n", 
desc, mdname(mddev)); 9762 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 9763 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 9764 speed_max(mddev), desc); 9765 9766 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 9767 9768 io_sectors = 0; 9769 for (m = 0; m < SYNC_MARKS; m++) { 9770 mark[m] = jiffies; 9771 mark_cnt[m] = io_sectors; 9772 } 9773 last_mark = 0; 9774 mddev->resync_mark = mark[last_mark]; 9775 mddev->resync_mark_cnt = mark_cnt[last_mark]; 9776 9777 /* 9778 * Tune reconstruction: 9779 */ 9780 window = 32 * (PAGE_SIZE / 512); 9781 pr_debug("md: using %dk window, over a total of %lluk.\n", 9782 window/2, (unsigned long long)max_sectors/2); 9783 9784 atomic_set(&mddev->recovery_active, 0); 9785 last_check = 0; 9786 9787 if (j >= MD_RESYNC_ACTIVE) { 9788 pr_debug("md: resuming %s of %s from checkpoint.\n", 9789 desc, mdname(mddev)); 9790 mddev->curr_resync = j; 9791 } else 9792 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 9793 mddev->curr_resync_completed = j; 9794 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9795 md_new_event(); 9796 update_time = jiffies; 9797 9798 blk_start_plug(&plug); 9799 while (j < max_sectors) { 9800 sector_t sectors; 9801 9802 skipped = 0; 9803 9804 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9805 ((mddev->curr_resync > mddev->curr_resync_completed && 9806 (mddev->curr_resync - mddev->curr_resync_completed) 9807 > (max_sectors >> 4)) || 9808 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 9809 (j - mddev->curr_resync_completed)*2 9810 >= mddev->resync_max - mddev->curr_resync_completed || 9811 mddev->curr_resync_completed > mddev->resync_max 9812 )) { 9813 /* time to update curr_resync_completed */ 9814 wait_event(mddev->recovery_wait, 9815 atomic_read(&mddev->recovery_active) == 0); 9816 mddev->curr_resync_completed = j; 9817 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9818 j > mddev->resync_offset) 9819 mddev->resync_offset = j; 9820 update_time = jiffies; 9821 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9822 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9823 } 9824 9825 while (j >= mddev->resync_max && 9826 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9827 /* As this condition is controlled by user-space, 9828 * we can block indefinitely, so use '_interruptible' 9829 * to avoid triggering warnings. 9830 */ 9831 flush_signals(current); /* just in case */ 9832 wait_event_interruptible(mddev->recovery_wait, 9833 mddev->resync_max > j 9834 || test_bit(MD_RECOVERY_INTR, 9835 &mddev->recovery)); 9836 } 9837 9838 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9839 break; 9840 9841 if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) { 9842 sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j); 9843 if (sectors) 9844 goto update; 9845 } 9846 9847 sectors = mddev->pers->sync_request(mddev, j, max_sectors, 9848 &skipped); 9849 if (sectors == 0) { 9850 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9851 break; 9852 } 9853 9854 if (!skipped) { /* actual IO requested */ 9855 io_sectors += sectors; 9856 atomic_add(sectors, &mddev->recovery_active); 9857 } 9858 9859 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9860 break; 9861 9862update: 9863 j += sectors; 9864 if (j > max_sectors) 9865 /* when skipping, extra large numbers can be returned. 
*/ 9866 j = max_sectors; 9867 if (j >= MD_RESYNC_ACTIVE) 9868 mddev->curr_resync = j; 9869 mddev->curr_mark_cnt = io_sectors; 9870 if (last_check == 0) 9871 /* this is the earliest that rebuild will be 9872 * visible in /proc/mdstat 9873 */ 9874 md_new_event(); 9875 9876 if (last_check + window > io_sectors || j == max_sectors) 9877 continue; 9878 9879 last_check = io_sectors; 9880 repeat: 9881 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9882 /* step marks */ 9883 int next = (last_mark+1) % SYNC_MARKS; 9884 9885 mddev->resync_mark = mark[next]; 9886 mddev->resync_mark_cnt = mark_cnt[next]; 9887 mark[next] = jiffies; 9888 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9889 last_mark = next; 9890 } 9891 9892 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9893 break; 9894 9895 /* 9896 * this loop exits only if either when we are slower than 9897 * the 'hard' speed limit, or the system was IO-idle for 9898 * a jiffy. 9899 * the system might be non-idle CPU-wise, but we only care 9900 * about not overloading the IO subsystem. (things like an 9901 * e2fsck being done on the RAID array should execute fast) 9902 */ 9903 cond_resched(); 9904 9905 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9906 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9907 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9908 9909 if (currspeed > speed_min(mddev)) { 9910 if (currspeed > speed_max(mddev)) { 9911 msleep(500); 9912 goto repeat; 9913 } 9914 if (!sync_io_within_limit(mddev) && 9915 !is_mddev_idle(mddev, 0)) { 9916 /* 9917 * Give other IO more of a chance. 9918 * The faster the devices, the less we wait. 9919 */ 9920 wait_event(mddev->recovery_wait, 9921 !atomic_read(&mddev->recovery_active)); 9922 } 9923 } 9924 } 9925 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9926 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9927 ? "interrupted" : "done"); 9928 /* 9929 * this also signals 'finished resyncing' to md_stop 9930 */ 9931 blk_finish_plug(&plug); 9932 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9933 9934 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9935 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9936 /* All sync IO completes after recovery_active becomes 0 */ 9937 mddev->curr_resync_completed = mddev->curr_resync; 9938 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9939 } 9940 mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); 9941 9942 if (mddev->curr_resync > MD_RESYNC_ACTIVE) 9943 md_finish_sync(mddev, action); 9944 skip: 9945 /* set CHANGE_PENDING here since maybe another update is needed, 9946 * so other nodes are informed. It should be harmless for normal 9947 * raid */ 9948 set_mask_bits(&mddev->sb_flags, 0, 9949 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9950 spin_lock(&mddev->lock); 9951 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9952 /* We completed so min/max setting can be forgotten if used. 
9953 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9954 mddev->resync_min = 0;
9955 mddev->resync_max = MaxSector;
9956 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9957 mddev->resync_min = mddev->curr_resync_completed;
9958 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
9959 mddev->curr_resync = MD_RESYNC_NONE;
9960 spin_unlock(&mddev->lock);
9961
9962 wake_up(&resync_wait);
9963 md_wakeup_thread(mddev->thread);
9964 return;
9965}
9966EXPORT_SYMBOL_GPL(md_do_sync);
9967
9968static bool rdev_removeable(struct md_rdev *rdev)
9969{
9970 /* rdev is not used. */
9971 if (rdev->raid_disk < 0)
9972 return false;
9973
9974 /* There are still inflight io, don't remove this rdev. */
9975 if (atomic_read(&rdev->nr_pending))
9976 return false;
9977
9978 /*
9979 * An error occurred but has not yet been acknowledged by the metadata
9980 * handler, don't remove this rdev.
9981 */
9982 if (test_bit(Blocked, &rdev->flags))
9983 return false;
9984
9985 /* Faulty rdev is not used, it's safe to remove it. */
9986 if (test_bit(Faulty, &rdev->flags))
9987 return true;
9988
9989 /* Journal disk can only be removed if it's faulty. */
9990 if (test_bit(Journal, &rdev->flags))
9991 return false;
9992
9993 /*
9994 * 'In_sync' is cleared while 'raid_disk' is valid, which means
9995 * replacement has just become active from pers->spare_active(), and
9996 * then pers->hot_remove_disk() will replace this rdev with replacement.
9997 */
9998 if (!test_bit(In_sync, &rdev->flags))
9999 return true;
10000
10001 return false;
10002}
10003
10004static bool rdev_is_spare(struct md_rdev *rdev)
10005{
10006 return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
10007 !test_bit(In_sync, &rdev->flags) &&
10008 !test_bit(Journal, &rdev->flags) &&
10009 !test_bit(Faulty, &rdev->flags);
10010}
10011
10012static bool rdev_addable(struct md_rdev *rdev)
10013{
10014 struct mddev *mddev;
10015
10016 mddev = READ_ONCE(rdev->mddev);
10017 if (!mddev)
10018 return false;
10019
10020 /* rdev is already used, don't add it again. */
10021 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
10022 test_bit(Faulty, &rdev->flags))
10023 return false;
10024
10025 /* Allow to add journal disk. */
10026 if (test_bit(Journal, &rdev->flags))
10027 return true;
10028
10029 /* Allow to add if array is read-write. */
10030 if (md_is_rdwr(mddev))
10031 return true;
10032
10033 /*
10034 * For a read-only array, only allow re-adding an rdev. And if a bitmap is
10035 * used, don't allow re-adding an rdev that is too old.
10036 */ 10037 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) 10038 return true; 10039 10040 return false; 10041} 10042 10043static bool md_spares_need_change(struct mddev *mddev) 10044{ 10045 struct md_rdev *rdev; 10046 10047 rcu_read_lock(); 10048 rdev_for_each_rcu(rdev, mddev) { 10049 if (rdev_removeable(rdev) || rdev_addable(rdev)) { 10050 rcu_read_unlock(); 10051 return true; 10052 } 10053 } 10054 rcu_read_unlock(); 10055 return false; 10056} 10057 10058static int remove_spares(struct mddev *mddev, struct md_rdev *this) 10059{ 10060 struct md_rdev *rdev; 10061 int removed = 0; 10062 10063 rdev_for_each(rdev, mddev) { 10064 if ((this == NULL || rdev == this) && rdev_removeable(rdev) && 10065 !mddev->pers->hot_remove_disk(mddev, rdev)) { 10066 sysfs_unlink_rdev(mddev, rdev); 10067 rdev->saved_raid_disk = rdev->raid_disk; 10068 rdev->raid_disk = -1; 10069 removed++; 10070 } 10071 } 10072 10073 if (removed && mddev->kobj.sd) 10074 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 10075 10076 return removed; 10077} 10078 10079static int remove_and_add_spares(struct mddev *mddev, 10080 struct md_rdev *this) 10081{ 10082 struct md_rdev *rdev; 10083 int spares = 0; 10084 int removed = 0; 10085 10086 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 10087 /* Mustn't remove devices when resync thread is running */ 10088 return 0; 10089 10090 removed = remove_spares(mddev, this); 10091 if (this && removed) 10092 goto no_add; 10093 10094 rdev_for_each(rdev, mddev) { 10095 if (this && this != rdev) 10096 continue; 10097 if (rdev_is_spare(rdev)) 10098 spares++; 10099 if (!rdev_addable(rdev)) 10100 continue; 10101 if (!test_bit(Journal, &rdev->flags)) 10102 rdev->recovery_offset = 0; 10103 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 10104 /* failure here is OK */ 10105 sysfs_link_rdev(mddev, rdev); 10106 if (!test_bit(Journal, &rdev->flags)) 10107 spares++; 10108 md_new_event(); 10109 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 10110 } 10111 } 10112no_add: 10113 if (removed) 10114 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 10115 return spares; 10116} 10117 10118static bool md_choose_sync_action(struct mddev *mddev, int *spares) 10119{ 10120 /* Check if reshape is in progress first. */ 10121 if (mddev->reshape_position != MaxSector) { 10122 if (mddev->pers->check_reshape == NULL || 10123 mddev->pers->check_reshape(mddev) != 0) 10124 return false; 10125 10126 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 10127 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 10128 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 10129 return true; 10130 } 10131 10132 /* Check if resync is in progress. */ 10133 if (mddev->resync_offset < MaxSector) { 10134 remove_spares(mddev, NULL); 10135 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 10136 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 10137 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 10138 return true; 10139 } 10140 10141 /* 10142 * Remove any failed drives, then add spares if possible. Spares are 10143 * also removed and re-added, to allow the personality to fail the 10144 * re-add. 10145 */ 10146 *spares = remove_and_add_spares(mddev, NULL); 10147 if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) { 10148 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 10149 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 10150 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 10151 10152 /* Start new recovery. 
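 * (A recovery, rather than a resync, is chosen here because spares were
 * just added above or a lazy recovery was requested via
 * MD_RECOVERY_LAZY_RECOVER.)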
*/ 10153 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 10154 return true; 10155 } 10156 10157 /* Delay to choose resync/check/repair in md_do_sync(). */ 10158 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 10159 return true; 10160 10161 /* Nothing to be done */ 10162 return false; 10163} 10164 10165static void md_start_sync(struct work_struct *ws) 10166{ 10167 struct mddev *mddev = container_of(ws, struct mddev, sync_work); 10168 int spares = 0; 10169 bool suspend = false; 10170 char *name; 10171 10172 /* 10173 * If reshape is still in progress, spares won't be added or removed 10174 * from conf until reshape is done. 10175 */ 10176 if (mddev->reshape_position == MaxSector && 10177 md_spares_need_change(mddev)) { 10178 suspend = true; 10179 mddev_suspend(mddev, false); 10180 } 10181 10182 mddev_lock_nointr(mddev); 10183 if (!md_is_rdwr(mddev)) { 10184 /* 10185 * On a read-only array we can: 10186 * - remove failed devices 10187 * - add already-in_sync devices if the array itself is in-sync. 10188 * As we only add devices that are already in-sync, we can 10189 * activate the spares immediately. 10190 */ 10191 remove_and_add_spares(mddev, NULL); 10192 goto not_running; 10193 } 10194 10195 if (!md_choose_sync_action(mddev, &spares)) 10196 goto not_running; 10197 10198 if (!mddev->pers->sync_request) 10199 goto not_running; 10200 10201 /* 10202 * We are adding a device or devices to an array which has the bitmap 10203 * stored on all devices. So make sure all bitmap pages get written. 10204 */ 10205 if (spares && md_bitmap_enabled(mddev, true)) 10206 mddev->bitmap_ops->write_all(mddev); 10207 10208 name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? 10209 "reshape" : "resync"; 10210 rcu_assign_pointer(mddev->sync_thread, 10211 md_register_thread(md_do_sync, mddev, name)); 10212 if (!mddev->sync_thread) { 10213 pr_warn("%s: could not start resync thread...\n", 10214 mdname(mddev)); 10215 /* leave the spares where they are, it shouldn't hurt */ 10216 goto not_running; 10217 } 10218 10219 mddev_unlock(mddev); 10220 /* 10221 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 10222 * not set it again. Otherwise, we may cause issue like this one: 10223 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 10224 * Therefore, use __mddev_resume(mddev, false). 10225 */ 10226 if (suspend) 10227 __mddev_resume(mddev, false); 10228 md_wakeup_thread(mddev->sync_thread); 10229 sysfs_notify_dirent_safe(mddev->sysfs_action); 10230 md_new_event(); 10231 return; 10232 10233not_running: 10234 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 10235 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 10236 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 10237 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 10238 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10239 mddev_unlock(mddev); 10240 /* 10241 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 10242 * not set it again. Otherwise, we may cause issue like this one: 10243 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 10244 * Therefore, use __mddev_resume(mddev, false). 
10245 */
10246 if (suspend)
10247 __mddev_resume(mddev, false);
10248
10249 wake_up(&resync_wait);
10250 if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
10251 mddev->sysfs_action)
10252 sysfs_notify_dirent_safe(mddev->sysfs_action);
10253}
10254
10255static void unregister_sync_thread(struct mddev *mddev)
10256{
10257 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
10258 /* resync/recovery still happening */
10259 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10260 return;
10261 }
10262
10263 if (WARN_ON_ONCE(!mddev->sync_thread))
10264 return;
10265
10266 md_reap_sync_thread(mddev);
10267}
10268
10269static bool md_should_do_recovery(struct mddev *mddev)
10270{
10271 /*
10272 * As long as one of the following flags is set,
10273 * recovery work needs to be done or cleaned up.
10274 */
10275 if (test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
10276 test_bit(MD_RECOVERY_DONE, &mddev->recovery))
10277 return true;
10278
10279 /*
10280 * If no flags are set and the array is read-only,
10281 * there is nothing to do.
10282 */
10283 if (!md_is_rdwr(mddev))
10284 return false;
10285
10286 /*
10287 * MD_SB_CHANGE_PENDING indicates that the array is switching from clean to
10288 * active, and no action is needed for now.
10289 * All other MD_SB_* flags require a superblock update.
10290 */
10291 if (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING))
10292 return true;
10293
10294 /*
10295 * If the array is not using external metadata and there has been no data
10296 * written for some time, then the array's status needs to be set to
10297 * in_sync.
10298 */
10299 if (mddev->external == 0 && mddev->safemode == 1)
10300 return true;
10301
10302 /*
10303 * When the system is about to restart or the process receives a signal,
10304 * the array needs to be synchronized as soon as possible.
10305 * Once data synchronization completes, the array status needs to be
10306 * changed to in_sync.
10307 */
10308 if (mddev->safemode == 2 && !mddev->in_sync &&
10309 mddev->resync_offset == MaxSector)
10310 return true;
10311
10312 return false;
10313}
10314
10315/*
10316 * This routine is regularly called by all per-raid-array threads to
10317 * deal with generic issues like resync and superblock updates.
10318 * Raid personalities that don't have a thread (linear/raid0) do not
10319 * need this as they never do any recovery or update the superblock.
10320 *
10321 * It does not do any resync itself, but rather "forks" off other threads
10322 * to do that as needed.
10323 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
10324 * "->recovery" and create a thread at ->sync_thread.
10325 * When the thread finishes it sets MD_RECOVERY_DONE
10326 * and wakes up this thread, which will reap the thread and finish up.
10327 * This thread also removes any faulty devices (with nr_pending == 0).
10328 *
10329 * The overall approach is:
10330 * 1/ if the superblock needs updating, update it.
10331 * 2/ If a recovery thread is running, don't do anything else.
10332 * 3/ If recovery has finished, clean up, possibly marking spares active.
10333 * 4/ If there are any faulty devices, remove them.
10334 * 5/ If array is degraded, try to add spare devices
10335 * 6/ If array has spares or is not in-sync, start a resync thread.
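 *
 * In this file the work is split as follows: md_check_recovery() runs in the
 * per-array thread and, when something needs doing, queues mddev->sync_work;
 * md_start_sync() (the sync_work handler) chooses an action and registers the
 * sync thread running md_do_sync(); once that thread sets MD_RECOVERY_DONE,
 * md_reap_sync_thread() collects the result and clears the recovery flags.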
10336 */ 10337void md_check_recovery(struct mddev *mddev) 10338{ 10339 if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work) 10340 mddev->bitmap_ops->daemon_work(mddev); 10341 10342 if (signal_pending(current)) { 10343 if (mddev->pers->sync_request && !mddev->external) { 10344 pr_debug("md: %s in immediate safe mode\n", 10345 mdname(mddev)); 10346 mddev->safemode = 2; 10347 } 10348 flush_signals(current); 10349 } 10350 10351 if (!md_should_do_recovery(mddev)) 10352 return; 10353 10354 if (mddev_trylock(mddev)) { 10355 bool try_set_sync = mddev->safemode != 0; 10356 10357 if (!mddev->external && mddev->safemode == 1) 10358 mddev->safemode = 0; 10359 10360 if (!md_is_rdwr(mddev)) { 10361 struct md_rdev *rdev; 10362 10363 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 10364 unregister_sync_thread(mddev); 10365 goto unlock; 10366 } 10367 10368 if (!mddev->external && mddev->in_sync) 10369 /* 10370 * 'Blocked' flag not needed as failed devices 10371 * will be recorded if array switched to read/write. 10372 * Leaving it set will prevent the device 10373 * from being removed. 10374 */ 10375 rdev_for_each(rdev, mddev) 10376 clear_bit(Blocked, &rdev->flags); 10377 10378 /* 10379 * There is no thread, but we need to call 10380 * ->spare_active and clear saved_raid_disk 10381 */ 10382 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 10383 md_reap_sync_thread(mddev); 10384 10385 /* 10386 * Let md_start_sync() to remove and add rdevs to the 10387 * array. 10388 */ 10389 if (md_spares_need_change(mddev)) { 10390 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10391 queue_work(md_misc_wq, &mddev->sync_work); 10392 } 10393 10394 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 10395 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 10396 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 10397 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 10398 10399 goto unlock; 10400 } 10401 10402 if (mddev_is_clustered(mddev)) { 10403 struct md_rdev *rdev, *tmp; 10404 /* kick the device if another node issued a 10405 * remove disk. 10406 */ 10407 rdev_for_each_safe(rdev, tmp, mddev) { 10408 if (rdev->raid_disk < 0 && 10409 test_and_clear_bit(ClusterRemove, &rdev->flags)) 10410 md_kick_rdev_from_array(rdev); 10411 } 10412 } 10413 10414 if (try_set_sync && !mddev->external && !mddev->in_sync) { 10415 spin_lock(&mddev->lock); 10416 set_in_sync(mddev); 10417 spin_unlock(&mddev->lock); 10418 } 10419 10420 if (mddev->sb_flags) 10421 md_update_sb(mddev, 0); 10422 10423 /* 10424 * Never start a new sync thread if MD_RECOVERY_RUNNING is 10425 * still set. 10426 */ 10427 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 10428 unregister_sync_thread(mddev); 10429 goto unlock; 10430 } 10431 10432 /* Set RUNNING before clearing NEEDED to avoid 10433 * any transients in the value of "sync_action". 
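 * ("sync_action" is the per-array sysfs attribute that userspace reads to
 * see which sync action, if any, is currently in progress.)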
10434 */ 10435 mddev->curr_resync_completed = 0; 10436 spin_lock(&mddev->lock); 10437 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10438 spin_unlock(&mddev->lock); 10439 /* Clear some bits that don't mean anything, but 10440 * might be left set 10441 */ 10442 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 10443 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 10444 10445 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 10446 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 10447 queue_work(md_misc_wq, &mddev->sync_work); 10448 } else { 10449 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10450 wake_up(&resync_wait); 10451 } 10452 10453 unlock: 10454 wake_up(&mddev->sb_wait); 10455 mddev_unlock(mddev); 10456 } 10457} 10458EXPORT_SYMBOL(md_check_recovery); 10459 10460void md_reap_sync_thread(struct mddev *mddev) 10461{ 10462 struct md_rdev *rdev; 10463 sector_t old_dev_sectors = mddev->dev_sectors; 10464 bool is_reshaped = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 10465 10466 /* resync has finished, collect result */ 10467 md_unregister_thread(mddev, &mddev->sync_thread); 10468 atomic_inc(&mddev->sync_seq); 10469 10470 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 10471 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 10472 mddev->degraded != mddev->raid_disks) { 10473 /* success...*/ 10474 /* activate any spares */ 10475 if (mddev->pers->spare_active(mddev)) { 10476 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 10477 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 10478 } 10479 } 10480 10481 /* If array is no-longer degraded, then any saved_raid_disk 10482 * information must be scrapped. 10483 */ 10484 if (!mddev->degraded) 10485 rdev_for_each(rdev, mddev) 10486 rdev->saved_raid_disk = -1; 10487 10488 md_update_sb(mddev, 1); 10489 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can 10490 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 10491 * clustered raid */ 10492 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 10493 mddev->cluster_ops->resync_finish(mddev); 10494 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10495 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 10496 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 10497 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 10498 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 10499 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 10500 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 10501 /* 10502 * We call mddev->cluster_ops->update_size here because sync_size could 10503 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, 10504 * so it is time to update size across cluster. 
10505 */
10506 if (mddev_is_clustered(mddev) && is_reshaped &&
10507 mddev->pers->finish_reshape &&
10508 !test_bit(MD_CLOSING, &mddev->flags))
10509 mddev->cluster_ops->update_size(mddev, old_dev_sectors);
10510 /* flag recovery needed just to double check */
10511 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
10512 sysfs_notify_dirent_safe(mddev->sysfs_completed);
10513 sysfs_notify_dirent_safe(mddev->sysfs_action);
10514 md_new_event();
10515 if (mddev->event_work.func)
10516 queue_work(md_misc_wq, &mddev->event_work);
10517 wake_up(&resync_wait);
10518}
10519EXPORT_SYMBOL(md_reap_sync_thread);
10520
10521void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
10522{
10523 sysfs_notify_dirent_safe(rdev->sysfs_state);
10524 wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev),
10525 msecs_to_jiffies(5000));
10526 rdev_dec_pending(rdev, mddev);
10527}
10528EXPORT_SYMBOL(md_wait_for_blocked_rdev);
10529
10530void md_finish_reshape(struct mddev *mddev)
10531{
10532 /* called by the personality module when reshape completes. */
10533 struct md_rdev *rdev;
10534
10535 rdev_for_each(rdev, mddev) {
10536 if (rdev->data_offset > rdev->new_data_offset)
10537 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
10538 else
10539 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
10540 rdev->data_offset = rdev->new_data_offset;
10541 }
10542}
10543EXPORT_SYMBOL(md_finish_reshape);
10544
10545/* Bad block management */
10546
10547/* Returns true on success, false on failure */
10548bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
10549 int is_new)
10550{
10551 struct mddev *mddev = rdev->mddev;
10552
10553 /*
10554 * Recording new badblocks for a faulty rdev forces an unnecessary
10555 * superblock update. This is fragile for external management because
10556 * a userspace daemon may be trying to remove the device and a deadlock
10557 * may occur. This will probably be solved in mdadm, but it is safer to
10558 * avoid it here.
10559 */
10560 if (test_bit(Faulty, &rdev->flags))
10561 return true;
10562
10563 if (is_new)
10564 s += rdev->new_data_offset;
10565 else
10566 s += rdev->data_offset;
10567
10568 if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) {
10569 /*
10570 * Mark the disk as Faulty when setting badblocks fails;
10571 * otherwise bad sectors may be read.
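 * (md_error() asks the personality's error handler to fail the device and
 * flags recovery, so the bad region is no longer read from this rdev.)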
10572 */ 10573 md_error(mddev, rdev); 10574 return false; 10575 } 10576 10577 /* Make sure they get written out promptly */ 10578 if (test_bit(ExternalBbl, &rdev->flags)) 10579 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); 10580 sysfs_notify_dirent_safe(rdev->sysfs_state); 10581 set_mask_bits(&mddev->sb_flags, 0, 10582 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 10583 md_wakeup_thread(rdev->mddev->thread); 10584 return true; 10585} 10586EXPORT_SYMBOL_GPL(rdev_set_badblocks); 10587 10588void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 10589 int is_new) 10590{ 10591 if (is_new) 10592 s += rdev->new_data_offset; 10593 else 10594 s += rdev->data_offset; 10595 10596 if (!badblocks_clear(&rdev->badblocks, s, sectors)) 10597 return; 10598 10599 if (test_bit(ExternalBbl, &rdev->flags)) 10600 sysfs_notify_dirent_safe(rdev->sysfs_badblocks); 10601} 10602EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 10603 10604static int md_notify_reboot(struct notifier_block *this, 10605 unsigned long code, void *x) 10606{ 10607 struct mddev *mddev; 10608 10609 spin_lock(&all_mddevs_lock); 10610 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 10611 if (!mddev_get(mddev)) 10612 continue; 10613 spin_unlock(&all_mddevs_lock); 10614 if (mddev_trylock(mddev)) { 10615 if (mddev->pers) 10616 __md_stop_writes(mddev); 10617 if (mddev->persistent) 10618 mddev->safemode = 2; 10619 mddev_unlock(mddev); 10620 } 10621 spin_lock(&all_mddevs_lock); 10622 mddev_put_locked(mddev); 10623 } 10624 spin_unlock(&all_mddevs_lock); 10625 10626 return NOTIFY_DONE; 10627} 10628 10629static struct notifier_block md_notifier = { 10630 .notifier_call = md_notify_reboot, 10631 .next = NULL, 10632 .priority = INT_MAX, /* before any real devices */ 10633}; 10634 10635static void md_geninit(void) 10636{ 10637 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 10638 10639 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); 10640} 10641 10642static int __init md_init(void) 10643{ 10644 int ret = md_bitmap_init(); 10645 10646 if (ret) 10647 return ret; 10648 10649 ret = md_llbitmap_init(); 10650 if (ret) 10651 goto err_bitmap; 10652 10653 ret = -ENOMEM; 10654 md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0); 10655 if (!md_misc_wq) 10656 goto err_misc_wq; 10657 10658 ret = __register_blkdev(MD_MAJOR, "md", md_probe); 10659 if (ret < 0) 10660 goto err_md; 10661 10662 ret = __register_blkdev(0, "mdp", md_probe); 10663 if (ret < 0) 10664 goto err_mdp; 10665 mdp_major = ret; 10666 10667 register_reboot_notifier(&md_notifier); 10668 raid_table_header = register_sysctl("dev/raid", raid_table); 10669 10670 md_geninit(); 10671 return 0; 10672 10673err_mdp: 10674 unregister_blkdev(MD_MAJOR, "md"); 10675err_md: 10676 destroy_workqueue(md_misc_wq); 10677err_misc_wq: 10678 md_llbitmap_exit(); 10679err_bitmap: 10680 md_bitmap_exit(); 10681 return ret; 10682} 10683 10684static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 10685{ 10686 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 10687 struct md_rdev *rdev2, *tmp; 10688 int role, ret; 10689 10690 /* 10691 * If size is changed in another node then we need to 10692 * do resize as well. 
10693 */ 10694 if (mddev->dev_sectors != le64_to_cpu(sb->size)) { 10695 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); 10696 if (ret) 10697 pr_info("md-cluster: resize failed\n"); 10698 else if (md_bitmap_enabled(mddev, false)) 10699 mddev->bitmap_ops->update_sb(mddev->bitmap); 10700 } 10701 10702 /* Check for change of roles in the active devices */ 10703 rdev_for_each_safe(rdev2, tmp, mddev) { 10704 if (test_bit(Faulty, &rdev2->flags)) { 10705 if (test_bit(ClusterRemove, &rdev2->flags)) 10706 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 10707 continue; 10708 } 10709 10710 /* Check if the roles changed */ 10711 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 10712 10713 if (test_bit(Candidate, &rdev2->flags)) { 10714 if (role == MD_DISK_ROLE_FAULTY) { 10715 pr_info("md: Removing Candidate device %pg because add failed\n", 10716 rdev2->bdev); 10717 md_kick_rdev_from_array(rdev2); 10718 continue; 10719 } 10720 else 10721 clear_bit(Candidate, &rdev2->flags); 10722 } 10723 10724 if (role != rdev2->raid_disk) { 10725 /* 10726 * got activated except reshape is happening. 10727 */ 10728 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && 10729 !(le32_to_cpu(sb->feature_map) & 10730 MD_FEATURE_RESHAPE_ACTIVE) && 10731 !mddev->cluster_ops->resync_status_get(mddev)) { 10732 /* 10733 * -1 to make raid1_add_disk() set conf->fullsync 10734 * to 1. This could avoid skipping sync when the 10735 * remote node is down during resyncing. 10736 */ 10737 if ((le32_to_cpu(sb->feature_map) 10738 & MD_FEATURE_RECOVERY_OFFSET)) 10739 rdev2->saved_raid_disk = -1; 10740 else 10741 rdev2->saved_raid_disk = role; 10742 ret = remove_and_add_spares(mddev, rdev2); 10743 pr_info("Activated spare: %pg\n", 10744 rdev2->bdev); 10745 /* wakeup mddev->thread here, so array could 10746 * perform resync with the new activated disk */ 10747 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 10748 md_wakeup_thread(mddev->thread); 10749 } 10750 /* device faulty 10751 * We just want to do the minimum to mark the disk 10752 * as faulty. The recovery is performed by the 10753 * one who initiated the error. 10754 */ 10755 if (role == MD_DISK_ROLE_FAULTY || 10756 role == MD_DISK_ROLE_JOURNAL) { 10757 md_error(mddev, rdev2); 10758 clear_bit(Blocked, &rdev2->flags); 10759 } 10760 } 10761 } 10762 10763 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { 10764 ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); 10765 if (ret) 10766 pr_warn("md: updating array disks failed. %d\n", ret); 10767 } 10768 10769 /* 10770 * Since mddev->delta_disks has already updated in update_raid_disks, 10771 * so it is time to check reshape. 10772 */ 10773 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && 10774 (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 10775 /* 10776 * reshape is happening in the remote node, we need to 10777 * update reshape_position and call start_reshape. 10778 */ 10779 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 10780 if (mddev->pers->update_reshape_pos) 10781 mddev->pers->update_reshape_pos(mddev); 10782 if (mddev->pers->start_reshape) 10783 mddev->pers->start_reshape(mddev); 10784 } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && 10785 mddev->reshape_position != MaxSector && 10786 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 10787 /* reshape is just done in another node. 
*/ 10788 mddev->reshape_position = MaxSector; 10789 if (mddev->pers->update_reshape_pos) 10790 mddev->pers->update_reshape_pos(mddev); 10791 } 10792 10793 /* Finally set the event to be up to date */ 10794 mddev->events = le64_to_cpu(sb->events); 10795} 10796 10797static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) 10798{ 10799 int err; 10800 struct page *swapout = rdev->sb_page; 10801 struct mdp_superblock_1 *sb; 10802 10803 /* Store the sb page of the rdev in the swapout temporary 10804 * variable in case we err in the future 10805 */ 10806 rdev->sb_page = NULL; 10807 err = alloc_disk_sb(rdev); 10808 if (err == 0) { 10809 ClearPageUptodate(rdev->sb_page); 10810 rdev->sb_loaded = 0; 10811 err = super_types[mddev->major_version]. 10812 load_super(rdev, NULL, mddev->minor_version); 10813 } 10814 if (err < 0) { 10815 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", 10816 __func__, __LINE__, rdev->desc_nr, err); 10817 if (rdev->sb_page) 10818 put_page(rdev->sb_page); 10819 rdev->sb_page = swapout; 10820 rdev->sb_loaded = 1; 10821 return err; 10822 } 10823 10824 sb = page_address(rdev->sb_page); 10825 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET 10826 * is not set 10827 */ 10828 10829 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) 10830 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 10831 10832 /* The other node finished recovery, call spare_active to set 10833 * device In_sync and mddev->degraded 10834 */ 10835 if (rdev->recovery_offset == MaxSector && 10836 !test_bit(In_sync, &rdev->flags) && 10837 mddev->pers->spare_active(mddev)) 10838 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 10839 10840 put_page(swapout); 10841 return 0; 10842} 10843 10844void md_reload_sb(struct mddev *mddev, int nr) 10845{ 10846 struct md_rdev *rdev = NULL, *iter; 10847 int err; 10848 10849 /* Find the rdev */ 10850 rdev_for_each_rcu(iter, mddev) { 10851 if (iter->desc_nr == nr) { 10852 rdev = iter; 10853 break; 10854 } 10855 } 10856 10857 if (!rdev) { 10858 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); 10859 return; 10860 } 10861 10862 err = read_rdev(mddev, rdev); 10863 if (err < 0) 10864 return; 10865 10866 check_sb_changes(mddev, rdev); 10867 10868 /* Read all rdev's to update recovery_offset */ 10869 rdev_for_each_rcu(rdev, mddev) { 10870 if (!test_bit(Faulty, &rdev->flags)) 10871 read_rdev(mddev, rdev); 10872 } 10873} 10874EXPORT_SYMBOL(md_reload_sb); 10875 10876#ifndef MODULE 10877 10878/* 10879 * Searches all registered partitions for autorun RAID arrays 10880 * at boot time. 
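 * (Only compiled in when md is built into the kernel, see #ifndef MODULE
 * above; modular builds normally rely on userspace such as mdadm to
 * assemble arrays.)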
10881 */ 10882 10883static DEFINE_MUTEX(detected_devices_mutex); 10884static LIST_HEAD(all_detected_devices); 10885struct detected_devices_node { 10886 struct list_head list; 10887 dev_t dev; 10888}; 10889 10890void md_autodetect_dev(dev_t dev) 10891{ 10892 struct detected_devices_node *node_detected_dev; 10893 10894 node_detected_dev = kzalloc_obj(*node_detected_dev); 10895 if (node_detected_dev) { 10896 node_detected_dev->dev = dev; 10897 mutex_lock(&detected_devices_mutex); 10898 list_add_tail(&node_detected_dev->list, &all_detected_devices); 10899 mutex_unlock(&detected_devices_mutex); 10900 } 10901} 10902 10903void md_autostart_arrays(int part) 10904{ 10905 struct md_rdev *rdev; 10906 struct detected_devices_node *node_detected_dev; 10907 dev_t dev; 10908 int i_scanned, i_passed; 10909 10910 i_scanned = 0; 10911 i_passed = 0; 10912 10913 pr_info("md: Autodetecting RAID arrays.\n"); 10914 10915 mutex_lock(&detected_devices_mutex); 10916 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 10917 i_scanned++; 10918 node_detected_dev = list_entry(all_detected_devices.next, 10919 struct detected_devices_node, list); 10920 list_del(&node_detected_dev->list); 10921 dev = node_detected_dev->dev; 10922 kfree(node_detected_dev); 10923 mutex_unlock(&detected_devices_mutex); 10924 rdev = md_import_device(dev,0, 90); 10925 mutex_lock(&detected_devices_mutex); 10926 if (IS_ERR(rdev)) 10927 continue; 10928 10929 if (test_bit(Faulty, &rdev->flags)) 10930 continue; 10931 10932 set_bit(AutoDetected, &rdev->flags); 10933 list_add(&rdev->same_set, &pending_raid_disks); 10934 i_passed++; 10935 } 10936 mutex_unlock(&detected_devices_mutex); 10937 10938 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); 10939 10940 autorun_devices(part); 10941} 10942 10943#endif /* !MODULE */ 10944 10945static __exit void md_exit(void) 10946{ 10947 struct mddev *mddev; 10948 int delay = 1; 10949 10950 unregister_blkdev(MD_MAJOR,"md"); 10951 unregister_blkdev(mdp_major, "mdp"); 10952 unregister_reboot_notifier(&md_notifier); 10953 unregister_sysctl_table(raid_table_header); 10954 10955 /* We cannot unload the modules while some process is 10956 * waiting for us in select() or poll() - wake them up 10957 */ 10958 md_unloading = 1; 10959 while (waitqueue_active(&md_event_waiters)) { 10960 /* not safe to leave yet */ 10961 wake_up(&md_event_waiters); 10962 msleep(delay); 10963 delay += delay; 10964 } 10965 remove_proc_entry("mdstat", NULL); 10966 10967 spin_lock(&all_mddevs_lock); 10968 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 10969 if (!mddev_get(mddev)) 10970 continue; 10971 spin_unlock(&all_mddevs_lock); 10972 export_array(mddev); 10973 mddev->ctime = 0; 10974 mddev->hold_active = 0; 10975 /* 10976 * As the mddev is now fully clear, mddev_put will schedule 10977 * the mddev for destruction by a workqueue, and the 10978 * destroy_workqueue() below will wait for that to complete. 
10979 */ 10980 spin_lock(&all_mddevs_lock); 10981 mddev_put_locked(mddev); 10982 } 10983 spin_unlock(&all_mddevs_lock); 10984 10985 destroy_workqueue(md_misc_wq); 10986 md_bitmap_exit(); 10987} 10988 10989subsys_initcall(md_init); 10990module_exit(md_exit) 10991 10992static int get_ro(char *buffer, const struct kernel_param *kp) 10993{ 10994 return sprintf(buffer, "%d\n", start_readonly); 10995} 10996static int set_ro(const char *val, const struct kernel_param *kp) 10997{ 10998 return kstrtouint(val, 10, (unsigned int *)&start_readonly); 10999} 11000 11001module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 11002module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 11003module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 11004module_param(create_on_open, bool, S_IRUSR|S_IWUSR); 11005module_param(legacy_async_del_gendisk, bool, 0600); 11006module_param(check_new_feature, bool, 0600); 11007 11008MODULE_LICENSE("GPL"); 11009MODULE_DESCRIPTION("MD RAID framework"); 11010MODULE_ALIAS("md"); 11011MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
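
/*
 * Usage note: the module parameters above are normally visible under
 * /sys/module/md_mod/parameters/ (for example start_ro or create_on_open)
 * and can also be set on the kernel command line, e.g. md_mod.start_ro=1;
 * the exact prefix depends on how md is built.
 */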