Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'md/4.1-rc5-fixes' of git://neil.brown.name/md

Pull more md bugfixes from Neil Brown:
"Assorted fixes for new RAID5 stripe-batching functionality.

Unfortunately this functionality was merged a little prematurely. The
necessary testing and code review is now complete (or as complete as
it can be) and the code passes a variety of tests and looks quite
sensible.

Also a fix for some recent locking changes - a race was introduced
which causes a reshape request to sometimes fail. No data safety
issues"

* tag 'md/4.1-rc5-fixes' of git://neil.brown.name/md:
md: fix race when unfreezing sync_action
md/raid5: break stripe-batches when the array has failed.
md/raid5: call break_stripe_batch_list from handle_stripe_clean_event
md/raid5: be more selective about distributing flags across batch.
md/raid5: add handle_flags arg to break_stripe_batch_list.
md/raid5: duplicate some more handle_stripe_clean_event code in break_stripe_batch_list
md/raid5: remove condition test from check_break_stripe_batch_list.
md/raid5: Ensure a batch member is not handled prematurely.
md/raid5: close race between STRIPE_BIT_DELAY and batching.
md/raid5: ensure whole batch is delayed for all required bitmap updates.

+98 -67
+8 -6
drivers/md/md.c
··· 4211 4211 if (!mddev->pers || !mddev->pers->sync_request) 4212 4212 return -EINVAL; 4213 4213 4214 - if (cmd_match(page, "frozen")) 4215 - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4216 - else 4217 - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4218 4214 4219 4215 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 4216 + if (cmd_match(page, "frozen")) 4217 + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4218 + else 4219 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4220 4220 flush_workqueue(md_misc_wq); 4221 4221 if (mddev->sync_thread) { 4222 4222 set_bit(MD_RECOVERY_INTR, &mddev->recovery); ··· 4229 4229 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 4230 4230 return -EBUSY; 4231 4231 else if (cmd_match(page, "resync")) 4232 - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4232 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4233 4233 else if (cmd_match(page, "recover")) { 4234 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4234 4235 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4235 - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4236 4236 } else if (cmd_match(page, "reshape")) { 4237 4237 int err; 4238 4238 if (mddev->pers->start_reshape == NULL) 4239 4239 return -EINVAL; 4240 4240 err = mddev_lock(mddev); 4241 4241 if (!err) { 4242 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4242 4243 err = mddev->pers->start_reshape(mddev); 4243 4244 mddev_unlock(mddev); 4244 4245 } ··· 4251 4250 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4252 4251 else if (!cmd_match(page, "repair")) 4253 4252 return -EINVAL; 4253 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4254 4254 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4255 4255 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4256 4256 }
+86 -60
drivers/md/raid5.c
··· 749 749 static bool stripe_can_batch(struct stripe_head *sh) 750 750 { 751 751 return test_bit(STRIPE_BATCH_READY, &sh->state) && 752 + !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 752 753 is_full_stripe_write(sh); 753 754 } 754 755 ··· 837 836 if (atomic_dec_return(&conf->preread_active_stripes) 838 837 < IO_THRESHOLD) 839 838 md_wakeup_thread(conf->mddev->thread); 839 + 840 + if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 841 + int seq = sh->bm_seq; 842 + if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 843 + sh->batch_head->bm_seq > seq) 844 + seq = sh->batch_head->bm_seq; 845 + set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 846 + sh->batch_head->bm_seq = seq; 847 + } 840 848 841 849 atomic_inc(&sh->count); 842 850 unlock_out: ··· 2997 2987 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2998 2988 (unsigned long long)(*bip)->bi_iter.bi_sector, 2999 2989 (unsigned long long)sh->sector, dd_idx); 3000 - spin_unlock_irq(&sh->stripe_lock); 3001 2990 3002 2991 if (conf->mddev->bitmap && firstwrite) { 2992 + /* Cannot hold spinlock over bitmap_startwrite, 2993 + * but must ensure this isn't added to a batch until 2994 + * we have added to the bitmap and set bm_seq. 2995 + * So set STRIPE_BITMAP_PENDING to prevent 2996 + * batching. 2997 + * If multiple add_stripe_bio() calls race here they 2998 + * much all set STRIPE_BITMAP_PENDING. So only the first one 2999 + * to complete "bitmap_startwrite" gets to set 3000 + * STRIPE_BIT_DELAY. This is important as once a stripe 3001 + * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3002 + * any more. 
3003 + */ 3004 + set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3005 + spin_unlock_irq(&sh->stripe_lock); 3003 3006 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3004 3007 STRIPE_SECTORS, 0); 3005 - sh->bm_seq = conf->seq_flush+1; 3006 - set_bit(STRIPE_BIT_DELAY, &sh->state); 3008 + spin_lock_irq(&sh->stripe_lock); 3009 + clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3010 + if (!sh->batch_head) { 3011 + sh->bm_seq = conf->seq_flush+1; 3012 + set_bit(STRIPE_BIT_DELAY, &sh->state); 3013 + } 3007 3014 } 3015 + spin_unlock_irq(&sh->stripe_lock); 3008 3016 3009 3017 if (stripe_can_batch(sh)) 3010 3018 stripe_add_to_batch_list(conf, sh); ··· 3420 3392 set_bit(STRIPE_HANDLE, &sh->state); 3421 3393 } 3422 3394 3395 + static void break_stripe_batch_list(struct stripe_head *head_sh, 3396 + unsigned long handle_flags); 3423 3397 /* handle_stripe_clean_event 3424 3398 * any written block on an uptodate or failed drive can be returned. 3425 3399 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but ··· 3435 3405 int discard_pending = 0; 3436 3406 struct stripe_head *head_sh = sh; 3437 3407 bool do_endio = false; 3438 - int wakeup_nr = 0; 3439 3408 3440 3409 for (i = disks; i--; ) 3441 3410 if (sh->dev[i].written) { ··· 3523 3494 if (atomic_dec_and_test(&conf->pending_full_writes)) 3524 3495 md_wakeup_thread(conf->mddev->thread); 3525 3496 3526 - if (!head_sh->batch_head || !do_endio) 3527 - return; 3528 - for (i = 0; i < head_sh->disks; i++) { 3529 - if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 3530 - wakeup_nr++; 3531 - } 3532 - while (!list_empty(&head_sh->batch_list)) { 3533 - int i; 3534 - sh = list_first_entry(&head_sh->batch_list, 3535 - struct stripe_head, batch_list); 3536 - list_del_init(&sh->batch_list); 3537 - 3538 - set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, 3539 - head_sh->state & ~((1 << STRIPE_ACTIVE) | 3540 - (1 << STRIPE_PREREAD_ACTIVE) | 3541 - STRIPE_EXPAND_SYNC_FLAG)); 3542 - sh->check_state = head_sh->check_state; 
3543 - sh->reconstruct_state = head_sh->reconstruct_state; 3544 - for (i = 0; i < sh->disks; i++) { 3545 - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3546 - wakeup_nr++; 3547 - sh->dev[i].flags = head_sh->dev[i].flags; 3548 - } 3549 - 3550 - spin_lock_irq(&sh->stripe_lock); 3551 - sh->batch_head = NULL; 3552 - spin_unlock_irq(&sh->stripe_lock); 3553 - if (sh->state & STRIPE_EXPAND_SYNC_FLAG) 3554 - set_bit(STRIPE_HANDLE, &sh->state); 3555 - release_stripe(sh); 3556 - } 3557 - 3558 - spin_lock_irq(&head_sh->stripe_lock); 3559 - head_sh->batch_head = NULL; 3560 - spin_unlock_irq(&head_sh->stripe_lock); 3561 - wake_up_nr(&conf->wait_for_overlap, wakeup_nr); 3562 - if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG) 3563 - set_bit(STRIPE_HANDLE, &head_sh->state); 3497 + if (head_sh->batch_head && do_endio) 3498 + break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3564 3499 } 3565 3500 3566 3501 static void handle_stripe_dirtying(struct r5conf *conf, ··· 4165 4172 4166 4173 static int clear_batch_ready(struct stripe_head *sh) 4167 4174 { 4175 + /* Return '1' if this is a member of batch, or 4176 + * '0' if it is a lone stripe or a head which can now be 4177 + * handled. 
4178 + */ 4168 4179 struct stripe_head *tmp; 4169 4180 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4170 - return 0; 4181 + return (sh->batch_head && sh->batch_head != sh); 4171 4182 spin_lock(&sh->stripe_lock); 4172 4183 if (!sh->batch_head) { 4173 4184 spin_unlock(&sh->stripe_lock); ··· 4199 4202 return 0; 4200 4203 } 4201 4204 4202 - static void check_break_stripe_batch_list(struct stripe_head *sh) 4205 + static void break_stripe_batch_list(struct stripe_head *head_sh, 4206 + unsigned long handle_flags) 4203 4207 { 4204 - struct stripe_head *head_sh, *next; 4208 + struct stripe_head *sh, *next; 4205 4209 int i; 4206 - 4207 - if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4208 - return; 4209 - 4210 - head_sh = sh; 4210 + int do_wakeup = 0; 4211 4211 4212 4212 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4213 4213 4214 4214 list_del_init(&sh->batch_list); 4215 4215 4216 - set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, 4217 - head_sh->state & ~((1 << STRIPE_ACTIVE) | 4218 - (1 << STRIPE_PREREAD_ACTIVE) | 4219 - (1 << STRIPE_DEGRADED) | 4220 - STRIPE_EXPAND_SYNC_FLAG)); 4216 + WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4217 + (1 << STRIPE_SYNCING) | 4218 + (1 << STRIPE_REPLACED) | 4219 + (1 << STRIPE_PREREAD_ACTIVE) | 4220 + (1 << STRIPE_DELAYED) | 4221 + (1 << STRIPE_BIT_DELAY) | 4222 + (1 << STRIPE_FULL_WRITE) | 4223 + (1 << STRIPE_BIOFILL_RUN) | 4224 + (1 << STRIPE_COMPUTE_RUN) | 4225 + (1 << STRIPE_OPS_REQ_PENDING) | 4226 + (1 << STRIPE_DISCARD) | 4227 + (1 << STRIPE_BATCH_READY) | 4228 + (1 << STRIPE_BATCH_ERR) | 4229 + (1 << STRIPE_BITMAP_PENDING))); 4230 + WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4231 + (1 << STRIPE_REPLACED))); 4232 + 4233 + set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4234 + (1 << STRIPE_DEGRADED)), 4235 + head_sh->state & (1 << STRIPE_INSYNC)); 4236 + 4221 4237 sh->check_state = head_sh->check_state; 4222 4238 sh->reconstruct_state = 
head_sh->reconstruct_state; 4223 - for (i = 0; i < sh->disks; i++) 4239 + for (i = 0; i < sh->disks; i++) { 4240 + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4241 + do_wakeup = 1; 4224 4242 sh->dev[i].flags = head_sh->dev[i].flags & 4225 4243 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4226 - 4244 + } 4227 4245 spin_lock_irq(&sh->stripe_lock); 4228 4246 sh->batch_head = NULL; 4229 4247 spin_unlock_irq(&sh->stripe_lock); 4230 - 4231 - set_bit(STRIPE_HANDLE, &sh->state); 4248 + if (handle_flags == 0 || 4249 + sh->state & handle_flags) 4250 + set_bit(STRIPE_HANDLE, &sh->state); 4232 4251 release_stripe(sh); 4233 4252 } 4253 + spin_lock_irq(&head_sh->stripe_lock); 4254 + head_sh->batch_head = NULL; 4255 + spin_unlock_irq(&head_sh->stripe_lock); 4256 + for (i = 0; i < head_sh->disks; i++) 4257 + if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4258 + do_wakeup = 1; 4259 + if (head_sh->state & handle_flags) 4260 + set_bit(STRIPE_HANDLE, &head_sh->state); 4261 + 4262 + if (do_wakeup) 4263 + wake_up(&head_sh->raid_conf->wait_for_overlap); 4234 4264 } 4235 4265 4236 4266 static void handle_stripe(struct stripe_head *sh) ··· 4282 4258 return; 4283 4259 } 4284 4260 4285 - check_break_stripe_batch_list(sh); 4261 + if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4262 + break_stripe_batch_list(sh, 0); 4286 4263 4287 4264 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4288 4265 spin_lock(&sh->stripe_lock); ··· 4337 4312 if (s.failed > conf->max_degraded) { 4338 4313 sh->check_state = 0; 4339 4314 sh->reconstruct_state = 0; 4315 + break_stripe_batch_list(sh, 0); 4340 4316 if (s.to_read+s.to_write+s.written) 4341 4317 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 4342 4318 if (s.syncing + s.replacing)
+4 -1
drivers/md/raid5.h
··· 337 337 STRIPE_ON_RELEASE_LIST, 338 338 STRIPE_BATCH_READY, 339 339 STRIPE_BATCH_ERR, 340 + STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add 341 + * to batch yet. 342 + */ 340 343 }; 341 344 342 - #define STRIPE_EXPAND_SYNC_FLAG \ 345 + #define STRIPE_EXPAND_SYNC_FLAGS \ 343 346 ((1 << STRIPE_EXPAND_SOURCE) |\ 344 347 (1 << STRIPE_EXPAND_READY) |\ 345 348 (1 << STRIPE_EXPANDING) |\