Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'md-3.7-fixes' of git://neil.brown.name/md

Pull md fixes from NeilBrown:
"Several bug fixes for md in 3.7:

- raid5 discard has problems
- raid10 replacement devices have problems
- bad block lock seqlock usage has problems
- dm-raid doesn't free everything"

* tag 'md-3.7-fixes' of git://neil.brown.name/md:
md/raid10: decrement correct pending counter when writing to replacement.
md/raid10: close race that loses writes when replacement completes.
md/raid5: Make sure we clear R5_Discard when discard is finished.
md/raid5: move resolving of reconstruct_state earlier in stripe_handle.
md/raid5: round discard alignment up to power of 2.
md: make sure everything is freed when dm-raid stops an array.
md: Avoid writing to an invalid address if read_seqretry returned true.
md: Reassign the parameters if read_seqretry returned true in md_is_badblock().

+132 -105
+20 -7
drivers/md/md.c
··· 1817 1817 memset(bbp, 0xff, PAGE_SIZE); 1818 1818 1819 1819 for (i = 0 ; i < bb->count ; i++) { 1820 - u64 internal_bb = *p++; 1820 + u64 internal_bb = p[i]; 1821 1821 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 1822 1822 | BB_LEN(internal_bb)); 1823 - *bbp++ = cpu_to_le64(store_bb); 1823 + bbp[i] = cpu_to_le64(store_bb); 1824 1824 } 1825 1825 bb->changed = 0; 1826 1826 if (read_seqretry(&bb->lock, seq)) ··· 5294 5294 } 5295 5295 EXPORT_SYMBOL_GPL(md_stop_writes); 5296 5296 5297 - void md_stop(struct mddev *mddev) 5297 + static void __md_stop(struct mddev *mddev) 5298 5298 { 5299 5299 mddev->ready = 0; 5300 5300 mddev->pers->stop(mddev); ··· 5304 5304 mddev->pers = NULL; 5305 5305 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5306 5306 } 5307 + 5308 + void md_stop(struct mddev *mddev) 5309 + { 5310 + /* stop the array and free an attached data structures. 5311 + * This is called from dm-raid 5312 + */ 5313 + __md_stop(mddev); 5314 + bitmap_destroy(mddev); 5315 + if (mddev->bio_set) 5316 + bioset_free(mddev->bio_set); 5317 + } 5318 + 5307 5319 EXPORT_SYMBOL_GPL(md_stop); 5308 5320 5309 5321 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) ··· 5376 5364 set_disk_ro(disk, 0); 5377 5365 5378 5366 __md_stop_writes(mddev); 5379 - md_stop(mddev); 5367 + __md_stop(mddev); 5380 5368 mddev->queue->merge_bvec_fn = NULL; 5381 5369 mddev->queue->backing_dev_info.congested_fn = NULL; 5382 5370 ··· 7948 7936 sector_t *first_bad, int *bad_sectors) 7949 7937 { 7950 7938 int hi; 7951 - int lo = 0; 7939 + int lo; 7952 7940 u64 *p = bb->page; 7953 - int rv = 0; 7941 + int rv; 7954 7942 sector_t target = s + sectors; 7955 7943 unsigned seq; 7956 7944 ··· 7965 7953 7966 7954 retry: 7967 7955 seq = read_seqbegin(&bb->lock); 7968 - 7956 + lo = 0; 7957 + rv = 0; 7969 7958 hi = bb->count; 7970 7959 7971 7960 /* Binary search between lo and hi for 'target'
+69 -62
drivers/md/raid10.c
··· 499 499 */ 500 500 one_write_done(r10_bio); 501 501 if (dec_rdev) 502 - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); 502 + rdev_dec_pending(rdev, conf->mddev); 503 503 } 504 504 505 505 /* ··· 1334 1334 blocked_rdev = rrdev; 1335 1335 break; 1336 1336 } 1337 + if (rdev && (test_bit(Faulty, &rdev->flags) 1338 + || test_bit(Unmerged, &rdev->flags))) 1339 + rdev = NULL; 1337 1340 if (rrdev && (test_bit(Faulty, &rrdev->flags) 1338 1341 || test_bit(Unmerged, &rrdev->flags))) 1339 1342 rrdev = NULL; 1340 1343 1341 1344 r10_bio->devs[i].bio = NULL; 1342 1345 r10_bio->devs[i].repl_bio = NULL; 1343 - if (!rdev || test_bit(Faulty, &rdev->flags) || 1344 - test_bit(Unmerged, &rdev->flags)) { 1346 + 1347 + if (!rdev && !rrdev) { 1345 1348 set_bit(R10BIO_Degraded, &r10_bio->state); 1346 1349 continue; 1347 1350 } 1348 - if (test_bit(WriteErrorSeen, &rdev->flags)) { 1351 + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1349 1352 sector_t first_bad; 1350 1353 sector_t dev_sector = r10_bio->devs[i].addr; 1351 1354 int bad_sectors; ··· 1390 1387 max_sectors = good_sectors; 1391 1388 } 1392 1389 } 1393 - r10_bio->devs[i].bio = bio; 1394 - atomic_inc(&rdev->nr_pending); 1390 + if (rdev) { 1391 + r10_bio->devs[i].bio = bio; 1392 + atomic_inc(&rdev->nr_pending); 1393 + } 1395 1394 if (rrdev) { 1396 1395 r10_bio->devs[i].repl_bio = bio; 1397 1396 atomic_inc(&rrdev->nr_pending); ··· 1449 1444 for (i = 0; i < conf->copies; i++) { 1450 1445 struct bio *mbio; 1451 1446 int d = r10_bio->devs[i].devnum; 1452 - if (!r10_bio->devs[i].bio) 1453 - continue; 1447 + if (r10_bio->devs[i].bio) { 1448 + struct md_rdev *rdev = conf->mirrors[d].rdev; 1449 + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1450 + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, 1451 + max_sectors); 1452 + r10_bio->devs[i].bio = mbio; 1454 1453 1455 - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1456 - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, 1457 - max_sectors); 1458 - 
r10_bio->devs[i].bio = mbio; 1454 + mbio->bi_sector = (r10_bio->devs[i].addr+ 1455 + choose_data_offset(r10_bio, 1456 + rdev)); 1457 + mbio->bi_bdev = rdev->bdev; 1458 + mbio->bi_end_io = raid10_end_write_request; 1459 + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; 1460 + mbio->bi_private = r10_bio; 1459 1461 1460 - mbio->bi_sector = (r10_bio->devs[i].addr+ 1461 - choose_data_offset(r10_bio, 1462 - conf->mirrors[d].rdev)); 1463 - mbio->bi_bdev = conf->mirrors[d].rdev->bdev; 1464 - mbio->bi_end_io = raid10_end_write_request; 1465 - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; 1466 - mbio->bi_private = r10_bio; 1462 + atomic_inc(&r10_bio->remaining); 1467 1463 1468 - atomic_inc(&r10_bio->remaining); 1464 + cb = blk_check_plugged(raid10_unplug, mddev, 1465 + sizeof(*plug)); 1466 + if (cb) 1467 + plug = container_of(cb, struct raid10_plug_cb, 1468 + cb); 1469 + else 1470 + plug = NULL; 1471 + spin_lock_irqsave(&conf->device_lock, flags); 1472 + if (plug) { 1473 + bio_list_add(&plug->pending, mbio); 1474 + plug->pending_cnt++; 1475 + } else { 1476 + bio_list_add(&conf->pending_bio_list, mbio); 1477 + conf->pending_count++; 1478 + } 1479 + spin_unlock_irqrestore(&conf->device_lock, flags); 1480 + if (!plug) 1481 + md_wakeup_thread(mddev->thread); 1482 + } 1469 1483 1470 - cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); 1471 - if (cb) 1472 - plug = container_of(cb, struct raid10_plug_cb, cb); 1473 - else 1474 - plug = NULL; 1475 - spin_lock_irqsave(&conf->device_lock, flags); 1476 - if (plug) { 1477 - bio_list_add(&plug->pending, mbio); 1478 - plug->pending_cnt++; 1479 - } else { 1484 + if (r10_bio->devs[i].repl_bio) { 1485 + struct md_rdev *rdev = conf->mirrors[d].replacement; 1486 + if (rdev == NULL) { 1487 + /* Replacement just got moved to main 'rdev' */ 1488 + smp_mb(); 1489 + rdev = conf->mirrors[d].rdev; 1490 + } 1491 + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1492 + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, 1493 + 
max_sectors); 1494 + r10_bio->devs[i].repl_bio = mbio; 1495 + 1496 + mbio->bi_sector = (r10_bio->devs[i].addr + 1497 + choose_data_offset( 1498 + r10_bio, rdev)); 1499 + mbio->bi_bdev = rdev->bdev; 1500 + mbio->bi_end_io = raid10_end_write_request; 1501 + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; 1502 + mbio->bi_private = r10_bio; 1503 + 1504 + atomic_inc(&r10_bio->remaining); 1505 + spin_lock_irqsave(&conf->device_lock, flags); 1480 1506 bio_list_add(&conf->pending_bio_list, mbio); 1481 1507 conf->pending_count++; 1508 + spin_unlock_irqrestore(&conf->device_lock, flags); 1509 + if (!mddev_check_plugged(mddev)) 1510 + md_wakeup_thread(mddev->thread); 1482 1511 } 1483 - spin_unlock_irqrestore(&conf->device_lock, flags); 1484 - if (!plug) 1485 - md_wakeup_thread(mddev->thread); 1486 - 1487 - if (!r10_bio->devs[i].repl_bio) 1488 - continue; 1489 - 1490 - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1491 - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, 1492 - max_sectors); 1493 - r10_bio->devs[i].repl_bio = mbio; 1494 - 1495 - /* We are actively writing to the original device 1496 - * so it cannot disappear, so the replacement cannot 1497 - * become NULL here 1498 - */ 1499 - mbio->bi_sector = (r10_bio->devs[i].addr + 1500 - choose_data_offset( 1501 - r10_bio, 1502 - conf->mirrors[d].replacement)); 1503 - mbio->bi_bdev = conf->mirrors[d].replacement->bdev; 1504 - mbio->bi_end_io = raid10_end_write_request; 1505 - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; 1506 - mbio->bi_private = r10_bio; 1507 - 1508 - atomic_inc(&r10_bio->remaining); 1509 - spin_lock_irqsave(&conf->device_lock, flags); 1510 - bio_list_add(&conf->pending_bio_list, mbio); 1511 - conf->pending_count++; 1512 - spin_unlock_irqrestore(&conf->device_lock, flags); 1513 - if (!mddev_check_plugged(mddev)) 1514 - md_wakeup_thread(mddev->thread); 1515 1512 } 1516 1513 1517 1514 /* Don't remove the bias on 'remaining' (one_write_done) until
+43 -36
drivers/md/raid5.c
··· 2774 2774 dev = &sh->dev[i]; 2775 2775 if (!test_bit(R5_LOCKED, &dev->flags) && 2776 2776 (test_bit(R5_UPTODATE, &dev->flags) || 2777 - test_and_clear_bit(R5_Discard, &dev->flags))) { 2777 + test_bit(R5_Discard, &dev->flags))) { 2778 2778 /* We can return any write requests */ 2779 2779 struct bio *wbi, *wbi2; 2780 2780 pr_debug("Return write for disc %d\n", i); 2781 + if (test_and_clear_bit(R5_Discard, &dev->flags)) 2782 + clear_bit(R5_UPTODATE, &dev->flags); 2781 2783 wbi = dev->written; 2782 2784 dev->written = NULL; 2783 2785 while (wbi && wbi->bi_sector < ··· 2797 2795 !test_bit(STRIPE_DEGRADED, &sh->state), 2798 2796 0); 2799 2797 } 2800 - } 2798 + } else if (test_bit(R5_Discard, &sh->dev[i].flags)) 2799 + clear_bit(R5_Discard, &sh->dev[i].flags); 2801 2800 2802 2801 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2803 2802 if (atomic_dec_and_test(&conf->pending_full_writes)) ··· 3493 3490 handle_failed_sync(conf, sh, &s); 3494 3491 } 3495 3492 3496 - /* 3497 - * might be able to return some write requests if the parity blocks 3498 - * are safe, or on a failed drive 3499 - */ 3500 - pdev = &sh->dev[sh->pd_idx]; 3501 - s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3502 - || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3503 - qdev = &sh->dev[sh->qd_idx]; 3504 - s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3505 - || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3506 - || conf->level < 6; 3507 - 3508 - if (s.written && 3509 - (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3510 - && !test_bit(R5_LOCKED, &pdev->flags) 3511 - && (test_bit(R5_UPTODATE, &pdev->flags) || 3512 - test_bit(R5_Discard, &pdev->flags))))) && 3513 - (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3514 - && !test_bit(R5_LOCKED, &qdev->flags) 3515 - && (test_bit(R5_UPTODATE, &qdev->flags) || 3516 - test_bit(R5_Discard, &qdev->flags)))))) 3517 - handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3518 - 3519 - /* Now we might 
consider reading some blocks, either to check/generate 3520 - * parity, or to satisfy requests 3521 - * or to load a block that is being partially written. 3522 - */ 3523 - if (s.to_read || s.non_overwrite 3524 - || (conf->level == 6 && s.to_write && s.failed) 3525 - || (s.syncing && (s.uptodate + s.compute < disks)) 3526 - || s.replacing 3527 - || s.expanding) 3528 - handle_stripe_fill(sh, &s, disks); 3529 - 3530 3493 /* Now we check to see if any write operations have recently 3531 3494 * completed 3532 3495 */ ··· 3529 3560 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3530 3561 s.dec_preread_active = 1; 3531 3562 } 3563 + 3564 + /* 3565 + * might be able to return some write requests if the parity blocks 3566 + * are safe, or on a failed drive 3567 + */ 3568 + pdev = &sh->dev[sh->pd_idx]; 3569 + s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 3570 + || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 3571 + qdev = &sh->dev[sh->qd_idx]; 3572 + s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 3573 + || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 3574 + || conf->level < 6; 3575 + 3576 + if (s.written && 3577 + (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3578 + && !test_bit(R5_LOCKED, &pdev->flags) 3579 + && (test_bit(R5_UPTODATE, &pdev->flags) || 3580 + test_bit(R5_Discard, &pdev->flags))))) && 3581 + (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3582 + && !test_bit(R5_LOCKED, &qdev->flags) 3583 + && (test_bit(R5_UPTODATE, &qdev->flags) || 3584 + test_bit(R5_Discard, &qdev->flags)))))) 3585 + handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3586 + 3587 + /* Now we might consider reading some blocks, either to check/generate 3588 + * parity, or to satisfy requests 3589 + * or to load a block that is being partially written. 
3590 + */ 3591 + if (s.to_read || s.non_overwrite 3592 + || (conf->level == 6 && s.to_write && s.failed) 3593 + || (s.syncing && (s.uptodate + s.compute < disks)) 3594 + || s.replacing 3595 + || s.expanding) 3596 + handle_stripe_fill(sh, &s, disks); 3532 3597 3533 3598 /* Now to consider new write requests and what else, if anything 3534 3599 * should be read. We do not handle new writes when: ··· 5532 5529 * discard data disk but write parity disk 5533 5530 */ 5534 5531 stripe = stripe * PAGE_SIZE; 5532 + /* Round up to power of 2, as discard handling 5533 + * currently assumes that */ 5534 + while ((stripe-1) & stripe) 5535 + stripe = (stripe | (stripe-1)) + 1; 5535 5536 mddev->queue->limits.discard_alignment = stripe; 5536 5537 mddev->queue->limits.discard_granularity = stripe; 5537 5538 /*