Merge branch 'for-linus-4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

-2

fs/btrfs/btrfs_inode.h

··· 44 44 #define BTRFS_INODE_IN_DELALLOC_LIST 9 45 45 #define BTRFS_INODE_READDIO_NEED_LOCK 10 46 46 #define BTRFS_INODE_HAS_PROPS 11 47 - /* DIO is ready to submit */ 48 - #define BTRFS_INODE_DIO_READY 12 49 47 /* 50 48 * The following 3 bits are meant only for the btree inode. 51 49 * When any of them is set, it means an error happened while writing an

-2

fs/btrfs/disk-io.c

··· 3765 3765 * block groups queued for removal, the deletion will be 3766 3766 * skipped when we quit the cleaner thread. 3767 3767 */ 3768 - mutex_lock(&root->fs_info->cleaner_mutex); 3769 3768 btrfs_delete_unused_bgs(root->fs_info); 3770 - mutex_unlock(&root->fs_info->cleaner_mutex); 3771 3769 3772 3770 ret = btrfs_commit_super(root); 3773 3771 if (ret)

+2 -5

fs/btrfs/extent-tree.c

··· 3742 3742 found->bytes_reserved = 0; 3743 3743 found->bytes_readonly = 0; 3744 3744 found->bytes_may_use = 0; 3745 - if (total_bytes > 0) 3746 - found->full = 0; 3747 - else 3748 - found->full = 1; 3745 + found->full = 0; 3749 3746 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3750 3747 found->chunk_alloc = 0; 3751 3748 found->flush = 0; ··· 8665 8668 } 8666 8669 8667 8670 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 8668 - btrfs_drop_and_free_fs_root(tree_root->fs_info, root); 8671 + btrfs_add_dropped_root(trans, root); 8669 8672 } else { 8670 8673 free_extent_buffer(root->node); 8671 8674 free_extent_buffer(root->commit_root);

+57 -8

fs/btrfs/extent_io.c

··· 2798 2798 bio_end_io_t end_io_func, 2799 2799 int mirror_num, 2800 2800 unsigned long prev_bio_flags, 2801 - unsigned long bio_flags) 2801 + unsigned long bio_flags, 2802 + bool force_bio_submit) 2802 2803 { 2803 2804 int ret = 0; 2804 2805 struct bio *bio; ··· 2815 2814 contig = bio_end_sector(bio) == sector; 2816 2815 2817 2816 if (prev_bio_flags != bio_flags || !contig || 2817 + force_bio_submit || 2818 2818 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || 2819 2819 bio_add_page(bio, page, page_size, offset) < page_size) { 2820 2820 ret = submit_one_bio(rw, bio, mirror_num, ··· 2912 2910 get_extent_t *get_extent, 2913 2911 struct extent_map **em_cached, 2914 2912 struct bio **bio, int mirror_num, 2915 - unsigned long *bio_flags, int rw) 2913 + unsigned long *bio_flags, int rw, 2914 + u64 *prev_em_start) 2916 2915 { 2917 2916 struct inode *inode = page->mapping->host; 2918 2917 u64 start = page_offset(page); ··· 2961 2958 } 2962 2959 while (cur <= end) { 2963 2960 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2961 + bool force_bio_submit = false; 2964 2962 2965 2963 if (cur >= last_byte) { 2966 2964 char *userpage; ··· 3012 3008 block_start = em->block_start; 3013 3009 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3014 3010 block_start = EXTENT_MAP_HOLE; 3011 + 3012 + /* 3013 + * If we have a file range that points to a compressed extent 3014 + * and it's followed by a consecutive file range that points to 3015 + * the same compressed extent (possibly with a different 3016 + * offset and/or length, so it either points to the whole extent 3017 + * or only part of it), we must make sure we do not submit a 3018 + * single bio to populate the pages for the 2 ranges because 3019 + * this makes the compressed extent read zero out the pages 3020 + * belonging to the 2nd range. 
Imagine the following scenario: 3021 + * 3022 + * File layout 3023 + * [0 - 8K] [8K - 24K] 3024 + * | | 3025 + * | | 3026 + * points to extent X, points to extent X, 3027 + * offset 4K, length of 8K offset 0, length 16K 3028 + * 3029 + * [extent X, compressed length = 4K uncompressed length = 16K] 3030 + * 3031 + * If the bio to read the compressed extent covers both ranges, 3032 + * it will decompress extent X into the pages belonging to the 3033 + * first range and then it will stop, zeroing out the remaining 3034 + * pages that belong to the other range that points to extent X. 3035 + * So here we make sure we submit 2 bios, one for the first 3036 + * range and another one for the second range. Both will target 3037 + * the same physical extent from disk, but we can't currently 3038 + * make the compressed bio endio callback populate the pages 3039 + * for both ranges because each compressed bio is tightly 3040 + * coupled with a single extent map, and each range can have 3041 + * an extent map with a different offset value relative to the 3042 + * uncompressed data of our extent and different lengths. This 3043 + * is a corner case so we prioritize correctness over 3044 + * non-optimal behavior (submitting 2 bios for the same extent). 
3045 + */ 3046 + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && 3047 + prev_em_start && *prev_em_start != (u64)-1 && 3048 + *prev_em_start != em->orig_start) 3049 + force_bio_submit = true; 3050 + 3051 + if (prev_em_start) 3052 + *prev_em_start = em->orig_start; 3053 + 3015 3054 free_extent_map(em); 3016 3055 em = NULL; 3017 3056 ··· 3104 3057 bdev, bio, pnr, 3105 3058 end_bio_extent_readpage, mirror_num, 3106 3059 *bio_flags, 3107 - this_bio_flag); 3060 + this_bio_flag, 3061 + force_bio_submit); 3108 3062 if (!ret) { 3109 3063 nr++; 3110 3064 *bio_flags = this_bio_flag; ··· 3137 3089 struct inode *inode; 3138 3090 struct btrfs_ordered_extent *ordered; 3139 3091 int index; 3092 + u64 prev_em_start = (u64)-1; 3140 3093 3141 3094 inode = pages[0]->mapping->host; 3142 3095 while (1) { ··· 3153 3104 3154 3105 for (index = 0; index < nr_pages; index++) { 3155 3106 __do_readpage(tree, pages[index], get_extent, em_cached, bio, 3156 - mirror_num, bio_flags, rw); 3107 + mirror_num, bio_flags, rw, &prev_em_start); 3157 3108 page_cache_release(pages[index]); 3158 3109 } 3159 3110 } ··· 3221 3172 } 3222 3173 3223 3174 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3224 - bio_flags, rw); 3175 + bio_flags, rw, NULL); 3225 3176 return ret; 3226 3177 } 3227 3178 ··· 3247 3198 int ret; 3248 3199 3249 3200 ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, 3250 - &bio_flags, READ); 3201 + &bio_flags, READ, NULL); 3251 3202 if (bio) 3252 3203 ret = submit_one_bio(READ, bio, mirror_num, bio_flags); 3253 3204 return ret; ··· 3500 3451 sector, iosize, pg_offset, 3501 3452 bdev, &epd->bio, max_nr, 3502 3453 end_bio_extent_writepage, 3503 - 0, 0, 0); 3454 + 0, 0, 0, false); 3504 3455 if (ret) 3505 3456 SetPageError(page); 3506 3457 } ··· 3803 3754 ret = submit_extent_page(rw, tree, wbc, p, offset >> 9, 3804 3755 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3805 3756 -1, end_bio_extent_buffer_writepage, 3806 - 0, epd->bio_flags, bio_flags); 3757 + 0, 
epd->bio_flags, bio_flags, false); 3807 3758 epd->bio_flags = bio_flags; 3808 3759 if (ret) { 3809 3760 set_btree_ioerr(p);

+23 -22

fs/btrfs/inode.c

··· 5084 5084 goto no_delete; 5085 5085 } 5086 5086 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ 5087 - btrfs_wait_ordered_range(inode, 0, (u64)-1); 5087 + if (!special_file(inode->i_mode)) 5088 + btrfs_wait_ordered_range(inode, 0, (u64)-1); 5088 5089 5089 5090 btrfs_free_io_failure_record(inode, 0, (u64)-1); 5090 5091 ··· 7409 7408 return em; 7410 7409 } 7411 7410 7411 + struct btrfs_dio_data { 7412 + u64 outstanding_extents; 7413 + u64 reserve; 7414 + }; 7412 7415 7413 7416 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, 7414 7417 struct buffer_head *bh_result, int create) ··· 7420 7415 struct extent_map *em; 7421 7416 struct btrfs_root *root = BTRFS_I(inode)->root; 7422 7417 struct extent_state *cached_state = NULL; 7418 + struct btrfs_dio_data *dio_data = NULL; 7423 7419 u64 start = iblock << inode->i_blkbits; 7424 7420 u64 lockstart, lockend; 7425 7421 u64 len = bh_result->b_size; 7426 - u64 *outstanding_extents = NULL; 7427 7422 int unlock_bits = EXTENT_LOCKED; 7428 7423 int ret = 0; 7429 7424 ··· 7441 7436 * that anything that needs to check if there's a transaction doesn't get 7442 7437 * confused. 7443 7438 */ 7444 - outstanding_extents = current->journal_info; 7439 + dio_data = current->journal_info; 7445 7440 current->journal_info = NULL; 7446 7441 } 7447 7442 ··· 7573 7568 * within our reservation, otherwise we need to adjust our inode 7574 7569 * counter appropriately. 
7575 7570 */ 7576 - if (*outstanding_extents) { 7577 - (*outstanding_extents)--; 7571 + if (dio_data->outstanding_extents) { 7572 + (dio_data->outstanding_extents)--; 7578 7573 } else { 7579 7574 spin_lock(&BTRFS_I(inode)->lock); 7580 7575 BTRFS_I(inode)->outstanding_extents++; 7581 7576 spin_unlock(&BTRFS_I(inode)->lock); 7582 7577 } 7583 7578 7584 - current->journal_info = outstanding_extents; 7585 7579 btrfs_free_reserved_data_space(inode, len); 7586 - set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags); 7580 + WARN_ON(dio_data->reserve < len); 7581 + dio_data->reserve -= len; 7582 + current->journal_info = dio_data; 7587 7583 } 7588 7584 7589 7585 /* ··· 7607 7601 unlock_err: 7608 7602 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7609 7603 unlock_bits, 1, 0, &cached_state, GFP_NOFS); 7610 - if (outstanding_extents) 7611 - current->journal_info = outstanding_extents; 7604 + if (dio_data) 7605 + current->journal_info = dio_data; 7612 7606 return ret; 7613 7607 } 7614 7608 ··· 8335 8329 { 8336 8330 struct file *file = iocb->ki_filp; 8337 8331 struct inode *inode = file->f_mapping->host; 8338 - u64 outstanding_extents = 0; 8332 + struct btrfs_root *root = BTRFS_I(inode)->root; 8333 + struct btrfs_dio_data dio_data = { 0 }; 8339 8334 size_t count = 0; 8340 8335 int flags = 0; 8341 8336 bool wakeup = true; ··· 8374 8367 ret = btrfs_delalloc_reserve_space(inode, count); 8375 8368 if (ret) 8376 8369 goto out; 8377 - outstanding_extents = div64_u64(count + 8370 + dio_data.outstanding_extents = div64_u64(count + 8378 8371 BTRFS_MAX_EXTENT_SIZE - 1, 8379 8372 BTRFS_MAX_EXTENT_SIZE); 8380 8373 ··· 8383 8376 * do the accounting properly if we go over the number we 8384 8377 * originally calculated. Abuse current->journal_info for this. 
8385 8378 */ 8386 - current->journal_info = &outstanding_extents; 8379 + dio_data.reserve = round_up(count, root->sectorsize); 8380 + current->journal_info = &dio_data; 8387 8381 } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, 8388 8382 &BTRFS_I(inode)->runtime_flags)) { 8389 8383 inode_dio_end(inode); ··· 8399 8391 if (iov_iter_rw(iter) == WRITE) { 8400 8392 current->journal_info = NULL; 8401 8393 if (ret < 0 && ret != -EIOCBQUEUED) { 8402 - /* 8403 - * If the error comes from submitting stage, 8404 - * btrfs_get_blocsk_direct() has free'd data space, 8405 - * and metadata space will be handled by 8406 - * finish_ordered_fn, don't do that again to make 8407 - * sure bytes_may_use is correct. 8408 - */ 8409 - if (!test_and_clear_bit(BTRFS_INODE_DIO_READY, 8410 - &BTRFS_I(inode)->runtime_flags)) 8411 - btrfs_delalloc_release_space(inode, count); 8394 + if (dio_data.reserve) 8395 + btrfs_delalloc_release_space(inode, 8396 + dio_data.reserve); 8412 8397 } else if (ret >= 0 && (size_t)ret < count) 8413 8398 btrfs_delalloc_release_space(inode, 8414 8399 count - (size_t)ret);

-2

fs/btrfs/super.c

··· 1658 1658 * groups on disk until we're mounted read-write again 1659 1659 * unless we clean them up here. 1660 1660 */ 1661 - mutex_lock(&root->fs_info->cleaner_mutex); 1662 1661 btrfs_delete_unused_bgs(fs_info); 1663 - mutex_unlock(&root->fs_info->cleaner_mutex); 1664 1662 1665 1663 btrfs_dev_replace_suspend_for_unmount(fs_info); 1666 1664 btrfs_scrub_cancel(fs_info);

+32

fs/btrfs/transaction.c

··· 117 117 btrfs_unpin_free_ino(root); 118 118 clear_btree_io_tree(&root->dirty_log_pages); 119 119 } 120 + 121 + /* We can free old roots now. */ 122 + spin_lock(&trans->dropped_roots_lock); 123 + while (!list_empty(&trans->dropped_roots)) { 124 + root = list_first_entry(&trans->dropped_roots, 125 + struct btrfs_root, root_list); 126 + list_del_init(&root->root_list); 127 + spin_unlock(&trans->dropped_roots_lock); 128 + btrfs_drop_and_free_fs_root(fs_info, root); 129 + spin_lock(&trans->dropped_roots_lock); 130 + } 131 + spin_unlock(&trans->dropped_roots_lock); 120 132 up_write(&fs_info->commit_root_sem); 121 133 } 122 134 ··· 267 255 INIT_LIST_HEAD(&cur_trans->pending_ordered); 268 256 INIT_LIST_HEAD(&cur_trans->dirty_bgs); 269 257 INIT_LIST_HEAD(&cur_trans->io_bgs); 258 + INIT_LIST_HEAD(&cur_trans->dropped_roots); 270 259 mutex_init(&cur_trans->cache_write_mutex); 271 260 cur_trans->num_dirty_bgs = 0; 272 261 spin_lock_init(&cur_trans->dirty_bgs_lock); 273 262 INIT_LIST_HEAD(&cur_trans->deleted_bgs); 274 263 spin_lock_init(&cur_trans->deleted_bgs_lock); 264 + spin_lock_init(&cur_trans->dropped_roots_lock); 275 265 list_add_tail(&cur_trans->list, &fs_info->trans_list); 276 266 extent_io_tree_init(&cur_trans->dirty_pages, 277 267 fs_info->btree_inode->i_mapping); ··· 349 335 return 0; 350 336 } 351 337 338 + 339 + void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, 340 + struct btrfs_root *root) 341 + { 342 + struct btrfs_transaction *cur_trans = trans->transaction; 343 + 344 + /* Add ourselves to the transaction dropped list */ 345 + spin_lock(&cur_trans->dropped_roots_lock); 346 + list_add_tail(&root->root_list, &cur_trans->dropped_roots); 347 + spin_unlock(&cur_trans->dropped_roots_lock); 348 + 349 + /* Make sure we don't try to update the root at commit time */ 350 + spin_lock(&root->fs_info->fs_roots_radix_lock); 351 + radix_tree_tag_clear(&root->fs_info->fs_roots_radix, 352 + (unsigned long)root->root_key.objectid, 353 + BTRFS_ROOT_TRANS_TAG); 354 
+ spin_unlock(&root->fs_info->fs_roots_radix_lock); 355 + } 352 356 353 357 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 354 358 struct btrfs_root *root)

+4 -1

fs/btrfs/transaction.h

··· 65 65 struct list_head switch_commits; 66 66 struct list_head dirty_bgs; 67 67 struct list_head io_bgs; 68 + struct list_head dropped_roots; 68 69 u64 num_dirty_bgs; 69 70 70 71 /* ··· 77 76 spinlock_t dirty_bgs_lock; 78 77 struct list_head deleted_bgs; 79 78 spinlock_t deleted_bgs_lock; 79 + spinlock_t dropped_roots_lock; 80 80 struct btrfs_delayed_ref_root delayed_refs; 81 81 int aborted; 82 82 int dirty_bg_run; ··· 218 216 int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 219 217 void btrfs_put_transaction(struct btrfs_transaction *transaction); 220 218 void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); 221 - 219 + void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, 220 + struct btrfs_root *root); 222 221 #endif

Configure Feed

Configure Feed