Merge tag 'md/4.10-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

+5

drivers/md/md.c

··· 5291 5291 if (start_readonly && mddev->ro == 0) 5292 5292 mddev->ro = 2; /* read-only, but switch on first write */ 5293 5293 5294 + /* 5295 + * NOTE: some pers->run(), for example r5l_recovery_log(), wakes 5296 + * up mddev->thread. It is important to initialize critical 5297 + * resources for mddev->thread BEFORE calling pers->run(). 5298 + */ 5294 5299 err = pers->run(mddev); 5295 5300 if (err) 5296 5301 pr_warn("md: pers->run() failed ...\n");

+88 -18

drivers/md/raid5-cache.c

··· 162 162 163 163 /* to submit async io_units, to fulfill ordering of flush */ 164 164 struct work_struct deferred_io_work; 165 + /* to disable write back in degraded mode */ 166 + struct work_struct disable_writeback_work; 165 167 }; 166 168 167 169 /* ··· 611 609 spin_unlock_irqrestore(&log->io_list_lock, flags); 612 610 if (io) 613 611 r5l_do_submit_io(log, io); 612 + } 613 + 614 + static void r5c_disable_writeback_async(struct work_struct *work) 615 + { 616 + struct r5l_log *log = container_of(work, struct r5l_log, 617 + disable_writeback_work); 618 + struct mddev *mddev = log->rdev->mddev; 619 + 620 + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 621 + return; 622 + pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", 623 + mdname(mddev)); 624 + mddev_suspend(mddev); 625 + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 626 + mddev_resume(mddev); 614 627 } 615 628 616 629 static void r5l_submit_current_io(struct r5l_log *log) ··· 1410 1393 next_checkpoint = r5c_calculate_new_cp(conf); 1411 1394 spin_unlock_irq(&log->io_list_lock); 1412 1395 1413 - BUG_ON(reclaimable < 0); 1414 - 1415 1396 if (reclaimable == 0 || !write_super) 1416 1397 return; 1417 1398 ··· 2077 2062 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, 2078 2063 struct r5l_recovery_ctx *ctx) 2079 2064 { 2080 - struct stripe_head *sh, *next; 2065 + struct stripe_head *sh; 2081 2066 struct mddev *mddev = log->rdev->mddev; 2082 2067 struct page *page; 2083 2068 sector_t next_checkpoint = MaxSector; ··· 2091 2076 2092 2077 WARN_ON(list_empty(&ctx->cached_list)); 2093 2078 2094 - list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { 2079 + list_for_each_entry(sh, &ctx->cached_list, lru) { 2095 2080 struct r5l_meta_block *mb; 2096 2081 int i; 2097 2082 int offset; ··· 2141 2126 ctx->pos = write_pos; 2142 2127 ctx->seq += 1; 2143 2128 next_checkpoint = sh->log_start; 2144 - list_del_init(&sh->lru); 2145 - raid5_release_stripe(sh); 2146 
2129 } 2147 2130 log->next_checkpoint = next_checkpoint; 2148 2131 __free_page(page); 2149 2132 return 0; 2133 + } 2134 + 2135 + static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, 2136 + struct r5l_recovery_ctx *ctx) 2137 + { 2138 + struct mddev *mddev = log->rdev->mddev; 2139 + struct r5conf *conf = mddev->private; 2140 + struct stripe_head *sh, *next; 2141 + 2142 + if (ctx->data_only_stripes == 0) 2143 + return; 2144 + 2145 + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; 2146 + 2147 + list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { 2148 + r5c_make_stripe_write_out(sh); 2149 + set_bit(STRIPE_HANDLE, &sh->state); 2150 + list_del_init(&sh->lru); 2151 + raid5_release_stripe(sh); 2152 + } 2153 + 2154 + md_wakeup_thread(conf->mddev->thread); 2155 + /* reuse conf->wait_for_quiescent in recovery */ 2156 + wait_event(conf->wait_for_quiescent, 2157 + atomic_read(&conf->active_stripes) == 0); 2158 + 2159 + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 2150 2160 } 2151 2161 2152 2162 static int r5l_recovery_log(struct r5l_log *log) ··· 2200 2160 pos = ctx.pos; 2201 2161 ctx.seq += 10000; 2202 2162 2203 - if (ctx.data_only_stripes == 0) { 2204 - log->next_checkpoint = ctx.pos; 2205 - r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); 2206 - ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); 2207 - } 2208 2163 2209 2164 if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) 2210 2165 pr_debug("md/raid:%s: starting from clean shutdown\n", 2211 2166 mdname(mddev)); 2212 - else { 2167 + else 2213 2168 pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", 2214 2169 mdname(mddev), ctx.data_only_stripes, 2215 2170 ctx.data_parity_stripes); 2216 2171 2217 - if (ctx.data_only_stripes > 0) 2218 - if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { 2219 - pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 2220 - mdname(mddev)); 2221 - return -EIO; 2222 - } 2172 + if 
(ctx.data_only_stripes == 0) { 2173 + log->next_checkpoint = ctx.pos; 2174 + r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); 2175 + ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); 2176 + } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { 2177 + pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 2178 + mdname(mddev)); 2179 + return -EIO; 2223 2180 } 2224 2181 2225 2182 log->log_start = ctx.pos; 2226 2183 log->seq = ctx.seq; 2227 2184 log->last_checkpoint = pos; 2228 2185 r5l_write_super(log, pos); 2186 + 2187 + r5c_recovery_flush_data_only_stripes(log, &ctx); 2229 2188 return 0; 2230 2189 } 2231 2190 ··· 2286 2247 val > R5C_JOURNAL_MODE_WRITE_BACK) 2287 2248 return -EINVAL; 2288 2249 2250 + if (raid5_calc_degraded(conf) > 0 && 2251 + val == R5C_JOURNAL_MODE_WRITE_BACK) 2252 + return -EINVAL; 2253 + 2289 2254 mddev_suspend(mddev); 2290 2255 conf->log->r5c_journal_mode = val; 2291 2256 mddev_resume(mddev); ··· 2344 2301 set_bit(STRIPE_R5C_CACHING, &sh->state); 2345 2302 } 2346 2303 2304 + /* 2305 + * When run in degraded mode, array is set to write-through mode. 2306 + * This check helps drain pending write safely in the transition to 2307 + * write-through mode. 
2308 + */ 2309 + if (s->failed) { 2310 + r5c_make_stripe_write_out(sh); 2311 + return -EAGAIN; 2312 + } 2313 + 2347 2314 for (i = disks; i--; ) { 2348 2315 dev = &sh->dev[i]; 2349 2316 /* if non-overwrite, use writing-out phase */ ··· 2404 2351 struct page *p = sh->dev[i].orig_page; 2405 2352 2406 2353 sh->dev[i].orig_page = sh->dev[i].page; 2354 + clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2355 + 2407 2356 if (!using_disk_info_extra_page) 2408 2357 put_page(p); 2409 2358 } ··· 2610 2555 return ret; 2611 2556 } 2612 2557 2558 + void r5c_update_on_rdev_error(struct mddev *mddev) 2559 + { 2560 + struct r5conf *conf = mddev->private; 2561 + struct r5l_log *log = conf->log; 2562 + 2563 + if (!log) 2564 + return; 2565 + 2566 + if (raid5_calc_degraded(conf) > 0 && 2567 + conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) 2568 + schedule_work(&log->disable_writeback_work); 2569 + } 2570 + 2613 2571 int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) 2614 2572 { 2615 2573 struct request_queue *q = bdev_get_queue(rdev->bdev); ··· 2695 2627 spin_lock_init(&log->no_space_stripes_lock); 2696 2628 2697 2629 INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); 2630 + INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async); 2698 2631 2699 2632 log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 2700 2633 INIT_LIST_HEAD(&log->stripe_in_journal_list); ··· 2728 2659 2729 2660 void r5l_exit_log(struct r5l_log *log) 2730 2661 { 2662 + flush_work(&log->disable_writeback_work); 2731 2663 md_unregister_thread(&log->reclaim_thread); 2732 2664 mempool_destroy(log->meta_pool); 2733 2665 bioset_free(log->bs);

+94 -27

drivers/md/raid5.c

··· 556 556 * of the two sections, and some non-in_sync devices may 557 557 * be insync in the section most affected by failed devices. 558 558 */ 559 - static int calc_degraded(struct r5conf *conf) 559 + int raid5_calc_degraded(struct r5conf *conf) 560 560 { 561 561 int degraded, degraded2; 562 562 int i; ··· 619 619 if (conf->mddev->reshape_position == MaxSector) 620 620 return conf->mddev->degraded > conf->max_degraded; 621 621 622 - degraded = calc_degraded(conf); 622 + degraded = raid5_calc_degraded(conf); 623 623 if (degraded > conf->max_degraded) 624 624 return 1; 625 625 return 0; ··· 1015 1015 1016 1016 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1017 1017 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1018 - sh->dev[i].vec.bv_page = sh->dev[i].page; 1018 + 1019 + if (!op_is_write(op) && 1020 + test_bit(R5_InJournal, &sh->dev[i].flags)) 1021 + /* 1022 + * issuing read for a page in journal, this 1023 + * must be preparing for prexor in rmw; read 1024 + * the data into orig_page 1025 + */ 1026 + sh->dev[i].vec.bv_page = sh->dev[i].orig_page; 1027 + else 1028 + sh->dev[i].vec.bv_page = sh->dev[i].page; 1019 1029 bi->bi_vcnt = 1; 1020 1030 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1021 1031 bi->bi_io_vec[0].bv_offset = 0; ··· 2390 2380 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2391 2381 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2392 2382 2383 + if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2384 + /* 2385 + * end read for a page in journal, this 2386 + * must be preparing for prexor in rmw 2387 + */ 2388 + set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2389 + 2393 2390 if (atomic_read(&rdev->read_errors)) 2394 2391 atomic_set(&rdev->read_errors, 0); 2395 2392 } else { ··· 2555 2538 2556 2539 spin_lock_irqsave(&conf->device_lock, flags); 2557 2540 clear_bit(In_sync, &rdev->flags); 2558 - mddev->degraded = calc_degraded(conf); 2541 + mddev->degraded = raid5_calc_degraded(conf); 2559 2542 spin_unlock_irqrestore(&conf->device_lock, flags); 
2560 2543 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2561 2544 ··· 2569 2552 bdevname(rdev->bdev, b), 2570 2553 mdname(mddev), 2571 2554 conf->raid_disks - mddev->degraded); 2555 + r5c_update_on_rdev_error(mddev); 2572 2556 } 2573 2557 2574 2558 /* ··· 2898 2880 return r_sector; 2899 2881 } 2900 2882 2883 + /* 2884 + * There are cases where we want handle_stripe_dirtying() and 2885 + * schedule_reconstruction() to delay towrite to some dev of a stripe. 2886 + * 2887 + * This function checks whether we want to delay the towrite. Specifically, 2888 + * we delay the towrite when: 2889 + * 2890 + * 1. degraded stripe has a non-overwrite to the missing dev, AND this 2891 + * stripe has data in journal (for other devices). 2892 + * 2893 + * In this case, when reading data for the non-overwrite dev, it is 2894 + * necessary to handle complex rmw of write back cache (prexor with 2895 + * orig_page, and xor with page). To keep read path simple, we would 2896 + * like to flush data in journal to RAID disks first, so complex rmw 2897 + * is handled in the write path (handle_stripe_dirtying). 2898 + * 2899 + */ 2900 + static inline bool delay_towrite(struct r5dev *dev, 2901 + struct stripe_head_state *s) 2902 + { 2903 + return !test_bit(R5_OVERWRITE, &dev->flags) && 2904 + !test_bit(R5_Insync, &dev->flags) && s->injournal; 2905 + } 2906 + 2901 2907 static void 2902 2908 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2903 2909 int rcw, int expand) ··· 2942 2900 for (i = disks; i--; ) { 2943 2901 struct r5dev *dev = &sh->dev[i]; 2944 2902 2945 - if (dev->towrite) { 2903 + if (dev->towrite && !delay_towrite(dev, s)) { 2946 2904 set_bit(R5_LOCKED, &dev->flags); 2947 2905 set_bit(R5_Wantdrain, &dev->flags); 2948 2906 if (!expand) ··· 3337 3295 return rv; 3338 3296 } 3339 3297 3340 - /* fetch_block - checks the given member device to see if its data needs 3341 - * to be read or computed to satisfy a request. 
3342 - * 3343 - * Returns 1 when no more member devices need to be checked, otherwise returns 3344 - * 0 to tell the loop in handle_stripe_fill to continue 3345 - */ 3346 - 3347 3298 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3348 3299 int disk_idx, int disks) 3349 3300 { ··· 3427 3392 return 0; 3428 3393 } 3429 3394 3395 + /* fetch_block - checks the given member device to see if its data needs 3396 + * to be read or computed to satisfy a request. 3397 + * 3398 + * Returns 1 when no more member devices need to be checked, otherwise returns 3399 + * 0 to tell the loop in handle_stripe_fill to continue 3400 + */ 3430 3401 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3431 3402 int disk_idx, int disks) 3432 3403 { ··· 3519 3478 * midst of changing due to a write 3520 3479 */ 3521 3480 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3522 - !sh->reconstruct_state) 3481 + !sh->reconstruct_state) { 3482 + 3483 + /* 3484 + * For degraded stripe with data in journal, do not handle 3485 + * read requests yet, instead, flush the stripe to raid 3486 + * disks first, this avoids handling complex rmw of write 3487 + * back cache (prexor with orig_page, and then xor with 3488 + * page) in the read path 3489 + */ 3490 + if (s->injournal && s->failed) { 3491 + if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 3492 + r5c_make_stripe_write_out(sh); 3493 + goto out; 3494 + } 3495 + 3523 3496 for (i = disks; i--; ) 3524 3497 if (fetch_block(sh, s, i, disks)) 3525 3498 break; 3499 + } 3500 + out: 3526 3501 set_bit(STRIPE_HANDLE, &sh->state); 3527 3502 } 3528 3503 ··· 3651 3594 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3652 3595 } 3653 3596 3597 + /* 3598 + * For RMW in write back cache, we need extra page in prexor to store the 3599 + * old data. This page is stored in dev->orig_page. 3600 + * 3601 + * This function checks whether we have data for prexor. 
The exact logic 3602 + * is: 3603 + * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 3604 + */ 3605 + static inline bool uptodate_for_rmw(struct r5dev *dev) 3606 + { 3607 + return (test_bit(R5_UPTODATE, &dev->flags)) && 3608 + (!test_bit(R5_InJournal, &dev->flags) || 3609 + test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 3610 + } 3611 + 3654 3612 static int handle_stripe_dirtying(struct r5conf *conf, 3655 3613 struct stripe_head *sh, 3656 3614 struct stripe_head_state *s, ··· 3694 3622 } else for (i = disks; i--; ) { 3695 3623 /* would I have to read this buffer for read_modify_write */ 3696 3624 struct r5dev *dev = &sh->dev[i]; 3697 - if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx || 3625 + if (((dev->towrite && !delay_towrite(dev, s)) || 3626 + i == sh->pd_idx || i == sh->qd_idx || 3698 3627 test_bit(R5_InJournal, &dev->flags)) && 3699 3628 !test_bit(R5_LOCKED, &dev->flags) && 3700 - !((test_bit(R5_UPTODATE, &dev->flags) && 3701 - (!test_bit(R5_InJournal, &dev->flags) || 3702 - dev->page != dev->orig_page)) || 3629 + !(uptodate_for_rmw(dev) || 3703 3630 test_bit(R5_Wantcompute, &dev->flags))) { 3704 3631 if (test_bit(R5_Insync, &dev->flags)) 3705 3632 rmw++; ··· 3710 3639 i != sh->pd_idx && i != sh->qd_idx && 3711 3640 !test_bit(R5_LOCKED, &dev->flags) && 3712 3641 !(test_bit(R5_UPTODATE, &dev->flags) || 3713 - test_bit(R5_InJournal, &dev->flags) || 3714 3642 test_bit(R5_Wantcompute, &dev->flags))) { 3715 3643 if (test_bit(R5_Insync, &dev->flags)) 3716 3644 rcw++; ··· 3759 3689 3760 3690 for (i = disks; i--; ) { 3761 3691 struct r5dev *dev = &sh->dev[i]; 3762 - if ((dev->towrite || 3692 + if (((dev->towrite && !delay_towrite(dev, s)) || 3763 3693 i == sh->pd_idx || i == sh->qd_idx || 3764 3694 test_bit(R5_InJournal, &dev->flags)) && 3765 3695 !test_bit(R5_LOCKED, &dev->flags) && 3766 - !((test_bit(R5_UPTODATE, &dev->flags) && 3767 - (!test_bit(R5_InJournal, &dev->flags) || 3768 - dev->page != dev->orig_page)) || 3696 + !(uptodate_for_rmw(dev) || 
3769 3697 test_bit(R5_Wantcompute, &dev->flags)) && 3770 3698 test_bit(R5_Insync, &dev->flags)) { 3771 3699 if (test_bit(STRIPE_PREREAD_ACTIVE, ··· 3790 3722 i != sh->pd_idx && i != sh->qd_idx && 3791 3723 !test_bit(R5_LOCKED, &dev->flags) && 3792 3724 !(test_bit(R5_UPTODATE, &dev->flags) || 3793 - test_bit(R5_InJournal, &dev->flags) || 3794 3725 test_bit(R5_Wantcompute, &dev->flags))) { 3795 3726 rcw++; 3796 3727 if (test_bit(R5_Insync, &dev->flags) && ··· 7092 7025 /* 7093 7026 * 0 for a fully functional array, 1 or 2 for a degraded array. 7094 7027 */ 7095 - mddev->degraded = calc_degraded(conf); 7028 + mddev->degraded = raid5_calc_degraded(conf); 7096 7029 7097 7030 if (has_failed(conf)) { 7098 7031 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", ··· 7339 7272 } 7340 7273 } 7341 7274 spin_lock_irqsave(&conf->device_lock, flags); 7342 - mddev->degraded = calc_degraded(conf); 7275 + mddev->degraded = raid5_calc_degraded(conf); 7343 7276 spin_unlock_irqrestore(&conf->device_lock, flags); 7344 7277 print_raid5_conf(conf); 7345 7278 return count; ··· 7699 7632 * pre and post number of devices. 7700 7633 */ 7701 7634 spin_lock_irqsave(&conf->device_lock, flags); 7702 - mddev->degraded = calc_degraded(conf); 7635 + mddev->degraded = raid5_calc_degraded(conf); 7703 7636 spin_unlock_irqrestore(&conf->device_lock, flags); 7704 7637 } 7705 7638 mddev->raid_disks = conf->raid_disks; ··· 7787 7720 } else { 7788 7721 int d; 7789 7722 spin_lock_irq(&conf->device_lock); 7790 - mddev->degraded = calc_degraded(conf); 7723 + mddev->degraded = raid5_calc_degraded(conf); 7791 7724 spin_unlock_irq(&conf->device_lock); 7792 7725 for (d = conf->raid_disks ; 7793 7726 d < conf->raid_disks - mddev->delta_disks;

+7

drivers/md/raid5.h

··· 322 322 * data and parity being written are in the journal 323 323 * device 324 324 */ 325 + R5_OrigPageUPTDODATE, /* with write back cache, we read old data into 326 + * dev->orig_page for prexor. When this flag is 327 + * set, orig_page contains latest data in the 328 + * raid disk. 329 + */ 325 330 }; 326 331 327 332 /* ··· 758 753 extern struct stripe_head * 759 754 raid5_get_active_stripe(struct r5conf *conf, sector_t sector, 760 755 int previous, int noblock, int noquiesce); 756 + extern int raid5_calc_degraded(struct r5conf *conf); 761 757 extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); 762 758 extern void r5l_exit_log(struct r5l_log *log); 763 759 extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); ··· 787 781 extern void r5c_check_stripe_cache_usage(struct r5conf *conf); 788 782 extern void r5c_check_cached_full_stripe(struct r5conf *conf); 789 783 extern struct md_sysfs_entry r5c_journal_mode; 784 + extern void r5c_update_on_rdev_error(struct mddev *mddev); 790 785 #endif

Configure Feed

Configure Feed