Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-linus-4.1' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs fixes from Chris Mason:
"The first commit is a fix from Filipe for a very old extent buffer
reuse race that triggered a BUG_ON. It hasn't come up often; I looked
through old logs at FB and we hit it a handful of times over the last
year.

The rest are fixes for other corner cases he hit during testing."

* 'for-linus-4.1' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs:
Btrfs: fix race when reusing stale extent buffers that leads to BUG_ON
Btrfs: fix race between block group creation and their cache writeout
Btrfs: fix panic when starting bg cache writeout after IO error
Btrfs: fix crash after inode cache writeback failure

+68 -10
+27 -4
fs/btrfs/extent-tree.c
··· 3180 3180 btrfs_mark_buffer_dirty(leaf); 3181 3181 fail: 3182 3182 btrfs_release_path(path); 3183 - if (ret) 3184 - btrfs_abort_transaction(trans, root, ret); 3185 3183 return ret; 3186 3184 3187 3185 } ··· 3485 3487 ret = 0; 3486 3488 } 3487 3489 } 3488 - if (!ret) 3490 + if (!ret) { 3489 3491 ret = write_one_cache_group(trans, root, path, cache); 3492 + /* 3493 + * Our block group might still be attached to the list 3494 + * of new block groups in the transaction handle of some 3495 + * other task (struct btrfs_trans_handle->new_bgs). This 3496 + * means its block group item isn't yet in the extent 3497 + * tree. If this happens ignore the error, as we will 3498 + * try again later in the critical section of the 3499 + * transaction commit. 3500 + */ 3501 + if (ret == -ENOENT) { 3502 + ret = 0; 3503 + spin_lock(&cur_trans->dirty_bgs_lock); 3504 + if (list_empty(&cache->dirty_list)) { 3505 + list_add_tail(&cache->dirty_list, 3506 + &cur_trans->dirty_bgs); 3507 + btrfs_get_block_group(cache); 3508 + } 3509 + spin_unlock(&cur_trans->dirty_bgs_lock); 3510 + } else if (ret) { 3511 + btrfs_abort_transaction(trans, root, ret); 3512 + } 3513 + } 3490 3514 3491 3515 /* if its not on the io list, we need to put the block group */ 3492 3516 if (should_put) ··· 3617 3597 ret = 0; 3618 3598 } 3619 3599 } 3620 - if (!ret) 3600 + if (!ret) { 3621 3601 ret = write_one_cache_group(trans, root, path, cache); 3602 + if (ret) 3603 + btrfs_abort_transaction(trans, root, ret); 3604 + } 3622 3605 3623 3606 /* if its not on the io list, we need to put the block group */ 3624 3607 if (should_put)
+19
fs/btrfs/extent_io.c
··· 4772 4772 start >> PAGE_CACHE_SHIFT); 4773 4773 if (eb && atomic_inc_not_zero(&eb->refs)) { 4774 4774 rcu_read_unlock(); 4775 + /* 4776 + * Lock our eb's refs_lock to avoid races with 4777 + * free_extent_buffer. When we get our eb it might be flagged 4778 + * with EXTENT_BUFFER_STALE and another task running 4779 + * free_extent_buffer might have seen that flag set, 4780 + * eb->refs == 2, that the buffer isn't under IO (dirty and 4781 + * writeback flags not set) and it's still in the tree (flag 4782 + * EXTENT_BUFFER_TREE_REF set), therefore being in the process 4783 + * of decrementing the extent buffer's reference count twice. 4784 + * So here we could race and increment the eb's reference count, 4785 + * clear its stale flag, mark it as dirty and drop our reference 4786 + * before the other task finishes executing free_extent_buffer, 4787 + * which would later result in an attempt to free an extent 4788 + * buffer that is dirty. 4789 + */ 4790 + if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 4791 + spin_lock(&eb->refs_lock); 4792 + spin_unlock(&eb->refs_lock); 4793 + } 4775 4794 mark_extent_buffer_accessed(eb, NULL); 4776 4795 return eb; 4777 4796 }
+12 -2
fs/btrfs/free-space-cache.c
··· 3466 3466 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 3467 3467 int ret; 3468 3468 struct btrfs_io_ctl io_ctl; 3469 + bool release_metadata = true; 3469 3470 3470 3471 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 3471 3472 return 0; ··· 3474 3473 memset(&io_ctl, 0, sizeof(io_ctl)); 3475 3474 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, 3476 3475 trans, path, 0); 3477 - if (!ret) 3476 + if (!ret) { 3477 + /* 3478 + * At this point writepages() didn't error out, so our metadata 3479 + * reservation is released when the writeback finishes, at 3480 + * inode.c:btrfs_finish_ordered_io(), regardless of it finishing 3481 + * with or without an error. 3482 + */ 3483 + release_metadata = false; 3478 3484 ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0); 3485 + } 3479 3486 3480 3487 if (ret) { 3481 - btrfs_delalloc_release_metadata(inode, inode->i_size); 3488 + if (release_metadata) 3489 + btrfs_delalloc_release_metadata(inode, inode->i_size); 3482 3490 #ifdef DEBUG 3483 3491 btrfs_err(root->fs_info, 3484 3492 "failed to write free ino cache for root %llu",
+10 -4
fs/btrfs/ordered-data.c
··· 722 722 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) 723 723 { 724 724 int ret = 0; 725 + int ret_wb = 0; 725 726 u64 end; 726 727 u64 orig_end; 727 728 struct btrfs_ordered_extent *ordered; ··· 742 741 if (ret) 743 742 return ret; 744 743 745 - ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); 746 - if (ret) 747 - return ret; 744 + /* 745 + * If we have a writeback error don't return immediately. Wait first 746 + * for any ordered extents that haven't completed yet. This is to make 747 + * sure no one can dirty the same page ranges and call writepages() 748 + * before the ordered extents complete - to avoid failures (-EEXIST) 749 + * when adding the new ordered extents to the ordered tree. 750 + */ 751 + ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); 748 752 749 753 end = orig_end; 750 754 while (1) { ··· 773 767 break; 774 768 end--; 775 769 } 776 - return ret; 770 + return ret_wb ? ret_wb : ret; 777 771 } 778 772 779 773 /*