Merge tag 'for-5.18-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

+5 -2

fs/btrfs/backref.c

··· 789 789 if (IS_ERR(eb)) { 790 790 free_pref(ref); 791 791 return PTR_ERR(eb); 792 - } else if (!extent_buffer_uptodate(eb)) { 792 + } 793 + if (!extent_buffer_uptodate(eb)) { 793 794 free_pref(ref); 794 795 free_extent_buffer(eb); 795 796 return -EIO; 796 797 } 798 + 797 799 if (lock) 798 800 btrfs_tree_read_lock(eb); 799 801 if (btrfs_header_level(eb) == 0) ··· 1337 1335 if (IS_ERR(eb)) { 1338 1336 ret = PTR_ERR(eb); 1339 1337 goto out; 1340 - } else if (!extent_buffer_uptodate(eb)) { 1338 + } 1339 + if (!extent_buffer_uptodate(eb)) { 1341 1340 free_extent_buffer(eb); 1342 1341 ret = -EIO; 1343 1342 goto out;

+33 -3

fs/btrfs/block-group.c

··· 1522 1522 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 1523 1523 return; 1524 1524 1525 - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) 1525 + sb_start_write(fs_info->sb); 1526 + 1527 + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { 1528 + sb_end_write(fs_info->sb); 1526 1529 return; 1530 + } 1527 1531 1528 1532 /* 1529 1533 * Long running balances can keep us blocked here for eternity, so ··· 1535 1531 */ 1536 1532 if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { 1537 1533 btrfs_exclop_finish(fs_info); 1534 + sb_end_write(fs_info->sb); 1538 1535 return; 1539 1536 } 1540 1537 ··· 1610 1605 spin_unlock(&fs_info->unused_bgs_lock); 1611 1606 mutex_unlock(&fs_info->reclaim_bgs_lock); 1612 1607 btrfs_exclop_finish(fs_info); 1608 + sb_end_write(fs_info->sb); 1613 1609 } 1614 1610 1615 1611 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) ··· 2012 2006 cache->length = key->offset; 2013 2007 cache->used = btrfs_stack_block_group_used(bgi); 2014 2008 cache->flags = btrfs_stack_block_group_flags(bgi); 2009 + cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); 2015 2010 2016 2011 set_free_space_tree_thresholds(cache); 2017 2012 ··· 2295 2288 spin_lock(&block_group->lock); 2296 2289 btrfs_set_stack_block_group_used(&bgi, block_group->used); 2297 2290 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2298 - BTRFS_FIRST_CHUNK_TREE_OBJECTID); 2291 + block_group->global_root_id); 2299 2292 btrfs_set_stack_block_group_flags(&bgi, block_group->flags); 2300 2293 key.objectid = block_group->start; 2301 2294 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; ··· 2451 2444 btrfs_trans_release_chunk_metadata(trans); 2452 2445 } 2453 2446 2447 + /* 2448 + * For extent tree v2 we use the block_group_item->chunk_offset to point at our 2449 + * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. 
2450 + */ 2451 + static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) 2452 + { 2453 + u64 div = SZ_1G; 2454 + u64 index; 2455 + 2456 + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 2457 + return BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2458 + 2459 + /* If we have a smaller fs index based on 128MiB. */ 2460 + if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) 2461 + div = SZ_128M; 2462 + 2463 + offset = div64_u64(offset, div); 2464 + div64_u64_rem(offset, fs_info->nr_global_roots, &index); 2465 + return index; 2466 + } 2467 + 2454 2468 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, 2455 2469 u64 bytes_used, u64 type, 2456 2470 u64 chunk_offset, u64 size) ··· 2492 2464 cache->flags = type; 2493 2465 cache->last_byte_to_unpin = (u64)-1; 2494 2466 cache->cached = BTRFS_CACHE_FINISHED; 2467 + cache->global_root_id = calculate_global_root_id(fs_info, cache->start); 2468 + 2495 2469 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 2496 2470 cache->needs_free_space = 1; 2497 2471 ··· 2723 2693 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2724 2694 btrfs_set_stack_block_group_used(&bgi, cache->used); 2725 2695 btrfs_set_stack_block_group_chunk_objectid(&bgi, 2726 - BTRFS_FIRST_CHUNK_TREE_OBJECTID); 2696 + cache->global_root_id); 2727 2697 btrfs_set_stack_block_group_flags(&bgi, cache->flags); 2728 2698 write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); 2729 2699 btrfs_mark_buffer_dirty(leaf);

+1

fs/btrfs/block-group.h

··· 68 68 u64 bytes_super; 69 69 u64 flags; 70 70 u64 cache_generation; 71 + u64 global_root_id; 71 72 72 73 /* 73 74 * If the free space extent count exceeds this number, convert the block

+40 -2

fs/btrfs/btrfs_inode.h

··· 14 14 #include "delayed-inode.h" 15 15 16 16 /* 17 + * Since we search a directory based on f_pos (struct dir_context::pos) we have 18 + * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so 19 + * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()). 20 + */ 21 + #define BTRFS_DIR_START_INDEX 2 22 + 23 + /* 17 24 * ordered_data_close is set by truncate when a file that used 18 25 * to have good data has been truncated to zero. When it is set 19 26 * the btrfs file release call will add this inode to the ··· 180 173 u64 disk_i_size; 181 174 182 175 /* 183 - * if this is a directory then index_cnt is the counter for the index 184 - * number for new files that are created 176 + * If this is a directory then index_cnt is the counter for the index 177 + * number for new files that are created. For an empty directory, this 178 + * must be initialized to BTRFS_DIR_START_INDEX. 185 179 */ 186 180 u64 index_cnt; 187 181 ··· 338 330 { 339 331 spin_lock(&inode->lock); 340 332 inode->last_sub_trans = inode->root->log_transid; 333 + spin_unlock(&inode->lock); 334 + } 335 + 336 + /* 337 + * Should be called while holding the inode's VFS lock in exclusive mode or in a 338 + * context where no one else can access the inode concurrently (during inode 339 + * creation or when loading an inode from disk). 340 + */ 341 + static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) 342 + { 343 + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); 344 + /* 345 + * The inode may have been part of a reflink operation in the last 346 + * transaction that modified it, and then a fsync has reset the 347 + * last_reflink_trans to avoid subsequent fsyncs in the same 348 + * transaction to do unnecessary work. So update last_reflink_trans 349 + * to the last_trans value (we have to be pessimistic and assume a 350 + * reflink happened). 
351 + * 352 + * The ->last_trans is protected by the inode's spinlock and we can 353 + * have a concurrent ordered extent completion update it. Also set 354 + * last_reflink_trans to ->last_trans only if the former is less than 355 + * the latter, because we can be called in a context where 356 + * last_reflink_trans was set to the current transaction generation 357 + * while ->last_trans was not yet updated in the current transaction, 358 + * and therefore has a lower value. 359 + */ 360 + spin_lock(&inode->lock); 361 + if (inode->last_reflink_trans < inode->last_trans) 362 + inode->last_reflink_trans = inode->last_trans; 341 363 spin_unlock(&inode->lock); 342 364 } 343 365

+37 -26

fs/btrfs/compression.c

··· 219 219 bi_size += bvec->bv_len; 220 220 221 221 if (bio->bi_status) 222 - cb->errors = 1; 222 + cb->status = bio->bi_status; 223 223 224 224 ASSERT(bi_size && bi_size <= cb->compressed_len); 225 225 last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, ··· 234 234 return last_io; 235 235 } 236 236 237 - static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio) 237 + static void finish_compressed_bio_read(struct compressed_bio *cb) 238 238 { 239 239 unsigned int index; 240 240 struct page *page; ··· 247 247 } 248 248 249 249 /* Do io completion on the original bio */ 250 - if (cb->errors) { 251 - bio_io_error(cb->orig_bio); 250 + if (cb->status != BLK_STS_OK) { 251 + cb->orig_bio->bi_status = cb->status; 252 + bio_endio(cb->orig_bio); 252 253 } else { 253 254 struct bio_vec *bvec; 254 255 struct bvec_iter_all iter_all; 255 256 256 - ASSERT(bio); 257 - ASSERT(!bio->bi_status); 258 257 /* 259 258 * We have verified the checksum already, set page checked so 260 259 * the end_io handlers know about it 261 260 */ 262 - ASSERT(!bio_flagged(bio, BIO_CLONED)); 261 + ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED)); 263 262 bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { 264 263 u64 bvec_start = page_offset(bvec->bv_page) + 265 264 bvec->bv_offset; ··· 307 308 * Some IO in this cb have failed, just skip checksum as there 308 309 * is no way it could be correct. 
309 310 */ 310 - if (cb->errors == 1) 311 + if (cb->status != BLK_STS_OK) 311 312 goto csum_failed; 312 313 313 314 inode = cb->inode; ··· 323 324 324 325 csum_failed: 325 326 if (ret) 326 - cb->errors = 1; 327 - finish_compressed_bio_read(cb, bio); 327 + cb->status = errno_to_blk_status(ret); 328 + finish_compressed_bio_read(cb); 328 329 out: 329 330 bio_put(bio); 330 331 } ··· 341 342 unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; 342 343 struct page *pages[16]; 343 344 unsigned long nr_pages = end_index - index + 1; 345 + const int errno = blk_status_to_errno(cb->status); 344 346 int i; 345 347 int ret; 346 348 347 - if (cb->errors) 348 - mapping_set_error(inode->i_mapping, -EIO); 349 + if (errno) 350 + mapping_set_error(inode->i_mapping, errno); 349 351 350 352 while (nr_pages > 0) { 351 353 ret = find_get_pages_contig(inode->i_mapping, index, ··· 358 358 continue; 359 359 } 360 360 for (i = 0; i < ret; i++) { 361 - if (cb->errors) 361 + if (errno) 362 362 SetPageError(pages[i]); 363 363 btrfs_page_clamp_clear_writeback(fs_info, pages[i], 364 364 cb->start, cb->len); ··· 381 381 */ 382 382 btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, 383 383 cb->start, cb->start + cb->len - 1, 384 - !cb->errors); 384 + cb->status == BLK_STS_OK); 385 385 386 - end_compressed_writeback(inode, cb); 386 + if (cb->writeback) 387 + end_compressed_writeback(inode, cb); 387 388 /* Note, our inode could be gone now */ 388 389 389 390 /* ··· 507 506 struct page **compressed_pages, 508 507 unsigned int nr_pages, 509 508 unsigned int write_flags, 510 - struct cgroup_subsys_state *blkcg_css) 509 + struct cgroup_subsys_state *blkcg_css, 510 + bool writeback) 511 511 { 512 512 struct btrfs_fs_info *fs_info = inode->root->fs_info; 513 513 struct bio *bio = NULL; ··· 526 524 if (!cb) 527 525 return BLK_STS_RESOURCE; 528 526 refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); 529 - cb->errors = 0; 527 + cb->status = BLK_STS_OK; 530 
528 cb->inode = &inode->vfs_inode; 531 529 cb->start = start; 532 530 cb->len = len; 533 531 cb->mirror_num = 0; 534 532 cb->compressed_pages = compressed_pages; 535 533 cb->compressed_len = compressed_len; 534 + cb->writeback = writeback; 536 535 cb->orig_bio = NULL; 537 536 cb->nr_pages = nr_pages; 538 537 ··· 594 591 595 592 if (submit) { 596 593 if (!skip_sum) { 597 - ret = btrfs_csum_one_bio(inode, bio, start, 1); 594 + ret = btrfs_csum_one_bio(inode, bio, start, true); 598 595 if (ret) 599 596 goto finish_cb; 600 597 } ··· 811 808 u64 em_len; 812 809 u64 em_start; 813 810 struct extent_map *em; 814 - blk_status_t ret = BLK_STS_RESOURCE; 811 + blk_status_t ret; 815 812 int faili = 0; 816 813 u8 *sums; 817 814 ··· 824 821 read_lock(&em_tree->lock); 825 822 em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); 826 823 read_unlock(&em_tree->lock); 827 - if (!em) 828 - return BLK_STS_IOERR; 824 + if (!em) { 825 + ret = BLK_STS_IOERR; 826 + goto out; 827 + } 829 828 830 829 ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); 831 830 compressed_len = em->block_len; 832 831 cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); 833 - if (!cb) 832 + if (!cb) { 833 + ret = BLK_STS_RESOURCE; 834 834 goto out; 835 + } 835 836 836 837 refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); 837 - cb->errors = 0; 838 + cb->status = BLK_STS_OK; 838 839 cb->inode = inode; 839 840 cb->mirror_num = mirror_num; 840 841 sums = cb->sums; ··· 858 851 nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); 859 852 cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), 860 853 GFP_NOFS); 861 - if (!cb->compressed_pages) 854 + if (!cb->compressed_pages) { 855 + ret = BLK_STS_RESOURCE; 862 856 goto fail1; 857 + } 863 858 864 859 for (pg_index = 0; pg_index < nr_pages; pg_index++) { 865 860 cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS); ··· 947 938 comp_bio = NULL; 948 939 } 949 940 } 950 - return 0; 941 + return 
BLK_STS_OK; 951 942 952 943 fail2: 953 944 while (faili >= 0) { ··· 960 951 kfree(cb); 961 952 out: 962 953 free_extent_map(em); 954 + bio->bi_status = ret; 955 + bio_endio(bio); 963 956 return ret; 964 957 finish_cb: 965 958 if (comp_bio) { ··· 981 970 */ 982 971 ASSERT(refcount_read(&cb->pending_sectors)); 983 972 /* Now we are the only one referring @cb, can finish it safely. */ 984 - finish_compressed_bio_read(cb, NULL); 973 + finish_compressed_bio_read(cb); 985 974 return ret; 986 975 } 987 976

+8 -2

fs/btrfs/compression.h

··· 22 22 23 23 /* Maximum length of compressed data stored on disk */ 24 24 #define BTRFS_MAX_COMPRESSED (SZ_128K) 25 + static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); 26 + 25 27 /* Maximum size of data before compression */ 26 28 #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) 27 29 ··· 54 52 /* The compression algorithm for this bio */ 55 53 u8 compress_type; 56 54 55 + /* Whether this is a write for writeback. */ 56 + bool writeback; 57 + 57 58 /* IO errors */ 58 - u8 errors; 59 + blk_status_t status; 59 60 int mirror_num; 60 61 61 62 /* for reads, this is the bio we are copying the data into */ ··· 100 95 struct page **compressed_pages, 101 96 unsigned int nr_pages, 102 97 unsigned int write_flags, 103 - struct cgroup_subsys_state *blkcg_css); 98 + struct cgroup_subsys_state *blkcg_css, 99 + bool writeback); 104 100 blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 105 101 int mirror_num, unsigned long bio_flags); 106 102

+61 -47

fs/btrfs/ctree.c

··· 846 846 btrfs_header_owner(parent), 847 847 btrfs_node_ptr_generation(parent, slot), 848 848 level - 1, &first_key); 849 - if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { 849 + if (IS_ERR(eb)) 850 + return eb; 851 + if (!extent_buffer_uptodate(eb)) { 850 852 free_extent_buffer(eb); 851 - eb = ERR_PTR(-EIO); 853 + return ERR_PTR(-EIO); 852 854 } 853 855 854 856 return eb; ··· 1438 1436 1439 1437 /* now we're allowed to do a blocking uptodate check */ 1440 1438 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); 1441 - if (!ret) { 1442 - *eb_ret = tmp; 1443 - return 0; 1439 + if (ret) { 1440 + free_extent_buffer(tmp); 1441 + btrfs_release_path(p); 1442 + return -EIO; 1444 1443 } 1445 - free_extent_buffer(tmp); 1446 - btrfs_release_path(p); 1447 - return -EIO; 1444 + *eb_ret = tmp; 1445 + return 0; 1448 1446 } 1449 1447 1450 1448 /* ··· 1462 1460 ret = -EAGAIN; 1463 1461 tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, 1464 1462 gen, parent_level - 1, &first_key); 1465 - if (!IS_ERR(tmp)) { 1466 - /* 1467 - * If the read above didn't mark this buffer up to date, 1468 - * it will never end up being up to date. Set ret to EIO now 1469 - * and give up so that our caller doesn't loop forever 1470 - * on our EAGAINs. 1471 - */ 1472 - if (!extent_buffer_uptodate(tmp)) 1473 - ret = -EIO; 1474 - free_extent_buffer(tmp); 1475 - } else { 1476 - ret = PTR_ERR(tmp); 1463 + if (IS_ERR(tmp)) { 1464 + btrfs_release_path(p); 1465 + return PTR_ERR(tmp); 1477 1466 } 1467 + /* 1468 + * If the read above didn't mark this buffer up to date, 1469 + * it will never end up being up to date. Set ret to EIO now 1470 + * and give up so that our caller doesn't loop forever 1471 + * on our EAGAINs. 
1472 + */ 1473 + if (!extent_buffer_uptodate(tmp)) 1474 + ret = -EIO; 1475 + free_extent_buffer(tmp); 1478 1476 1479 1477 btrfs_release_path(p); 1480 1478 return ret; ··· 2992 2990 if (free_space < data_size) 2993 2991 goto out_unlock; 2994 2992 2995 - /* cow and double check */ 2996 2993 ret = btrfs_cow_block(trans, root, right, upper, 2997 2994 slot + 1, &right, BTRFS_NESTING_RIGHT_COW); 2998 2995 if (ret) 2999 - goto out_unlock; 3000 - 3001 - free_space = btrfs_leaf_free_space(right); 3002 - if (free_space < data_size) 3003 2996 goto out_unlock; 3004 2997 3005 2998 left_nritems = btrfs_header_nritems(left); ··· 3221 3224 goto out; 3222 3225 } 3223 3226 3224 - /* cow and double check */ 3225 3227 ret = btrfs_cow_block(trans, root, left, 3226 3228 path->nodes[1], slot - 1, &left, 3227 3229 BTRFS_NESTING_LEFT_COW); ··· 3228 3232 /* we hit -ENOSPC, but it isn't fatal here */ 3229 3233 if (ret == -ENOSPC) 3230 3234 ret = 1; 3231 - goto out; 3232 - } 3233 - 3234 - free_space = btrfs_leaf_free_space(left); 3235 - if (free_space < data_size) { 3236 - ret = 1; 3237 3235 goto out; 3238 3236 } 3239 3237 ··· 4160 4170 { 4161 4171 struct btrfs_fs_info *fs_info = root->fs_info; 4162 4172 struct extent_buffer *leaf; 4163 - u32 last_off; 4164 - u32 dsize = 0; 4165 4173 int ret = 0; 4166 4174 int wret; 4167 - int i; 4168 4175 u32 nritems; 4169 4176 4170 4177 leaf = path->nodes[0]; 4171 - last_off = btrfs_item_offset(leaf, slot + nr - 1); 4172 - 4173 - for (i = 0; i < nr; i++) 4174 - dsize += btrfs_item_size(leaf, slot + i); 4175 - 4176 4178 nritems = btrfs_header_nritems(leaf); 4177 4179 4178 4180 if (slot + nr != nritems) { 4179 - int data_end = leaf_data_end(leaf); 4181 + const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); 4182 + const int data_end = leaf_data_end(leaf); 4180 4183 struct btrfs_map_token token; 4184 + u32 dsize = 0; 4185 + int i; 4186 + 4187 + for (i = 0; i < nr; i++) 4188 + dsize += btrfs_item_size(leaf, slot + i); 4181 4189 4182 4190 
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + 4183 4191 data_end + dsize, ··· 4215 4227 fixup_low_keys(path, &disk_key, 1); 4216 4228 } 4217 4229 4218 - /* delete the leaf if it is mostly empty */ 4230 + /* 4231 + * Try to delete the leaf if it is mostly empty. We do this by 4232 + * trying to move all its items into its left and right neighbours. 4233 + * If we can't move all the items, then we don't delete it - it's 4234 + * not ideal, but future insertions might fill the leaf with more 4235 + * items, or items from other leaves might be moved later into our 4236 + * leaf due to deletions on those leaves. 4237 + */ 4219 4238 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { 4239 + u32 min_push_space; 4240 + 4220 4241 /* push_leaf_left fixes the path. 4221 4242 * make sure the path still points to our leaf 4222 4243 * for possible call to del_ptr below 4223 4244 */ 4224 4245 slot = path->slots[1]; 4225 4246 atomic_inc(&leaf->refs); 4226 - 4227 - wret = push_leaf_left(trans, root, path, 1, 1, 4228 - 1, (u32)-1); 4247 + /* 4248 + * We want to be able to at least push one item to the 4249 + * left neighbour leaf, and that's the first item. 4250 + */ 4251 + min_push_space = sizeof(struct btrfs_item) + 4252 + btrfs_item_size(leaf, 0); 4253 + wret = push_leaf_left(trans, root, path, 0, 4254 + min_push_space, 1, (u32)-1); 4229 4255 if (wret < 0 && wret != -ENOSPC) 4230 4256 ret = wret; 4231 4257 4232 4258 if (path->nodes[0] == leaf && 4233 4259 btrfs_header_nritems(leaf)) { 4234 - wret = push_leaf_right(trans, root, path, 1, 4235 - 1, 1, 0); 4260 + /* 4261 + * If we were not able to push all items from our 4262 + * leaf to its left neighbour, then attempt to 4263 + * either push all the remaining items to the 4264 + * right neighbour or none. There's no advantage 4265 + * in pushing only some items, instead of all, as 4266 + * it's pointless to end up with a leaf having 4267 + * too few items while the neighbours can be full 4268 + * or nearly full. 
4269 + */ 4270 + nritems = btrfs_header_nritems(leaf); 4271 + min_push_space = leaf_space_used(leaf, 0, nritems); 4272 + wret = push_leaf_right(trans, root, path, 0, 4273 + min_push_space, 1, 0); 4236 4274 if (wret < 0 && wret != -ENOSPC) 4237 4275 ret = wret; 4238 4276 }

+71 -12

fs/btrfs/ctree.h

··· 49 49 struct btrfs_ordered_sum; 50 50 struct btrfs_ref; 51 51 struct btrfs_bio; 52 + struct btrfs_ioctl_encoded_io_args; 52 53 53 54 #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ 54 55 ··· 149 148 150 149 /* Indicates there was an error cleaning up a log tree. */ 151 150 BTRFS_FS_STATE_LOG_CLEANUP_ERROR, 151 + 152 + BTRFS_FS_STATE_COUNT 152 153 }; 153 154 154 155 #define BTRFS_BACKREF_REV_MAX 256 ··· 277 274 /* the UUID written into btree blocks */ 278 275 u8 metadata_uuid[BTRFS_FSID_SIZE]; 279 276 277 + /* Extent tree v2 */ 278 + __le64 block_group_root; 279 + __le64 block_group_root_generation; 280 + u8 block_group_root_level; 281 + 280 282 /* future expansion */ 281 - __le64 reserved[28]; 283 + u8 reserved8[7]; 284 + __le64 reserved[25]; 282 285 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 283 286 struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; 284 287 ··· 309 300 #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL 310 301 #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL 311 302 303 + #ifdef CONFIG_BTRFS_DEBUG 304 + /* 305 + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG 306 + */ 307 + #define BTRFS_FEATURE_INCOMPAT_SUPP \ 308 + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ 309 + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ 310 + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 311 + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ 312 + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ 313 + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ 314 + BTRFS_FEATURE_INCOMPAT_RAID56 | \ 315 + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ 316 + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ 317 + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ 318 + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ 319 + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ 320 + BTRFS_FEATURE_INCOMPAT_ZONED | \ 321 + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) 322 + #else 312 323 #define BTRFS_FEATURE_INCOMPAT_SUPP \ 313 324 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ 314 325 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ ··· 343 314 
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ 344 315 BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ 345 316 BTRFS_FEATURE_INCOMPAT_ZONED) 317 + #endif 346 318 347 319 #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ 348 320 (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) ··· 666 636 struct btrfs_root *quota_root; 667 637 struct btrfs_root *uuid_root; 668 638 struct btrfs_root *data_reloc_root; 639 + struct btrfs_root *block_group_root; 669 640 670 641 /* the log root tree is a directory of all the other log roots */ 671 642 struct btrfs_root *log_root_tree; ··· 1060 1029 */ 1061 1030 spinlock_t relocation_bg_lock; 1062 1031 u64 data_reloc_bg; 1032 + 1033 + u64 nr_global_roots; 1063 1034 1064 1035 spinlock_t zone_active_bgs_lock; 1065 1036 struct list_head zone_active_bgs; ··· 1642 1609 static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ 1643 1610 const type *s) \ 1644 1611 { \ 1645 - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ 1612 + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ 1646 1613 return btrfs_get_##bits(eb, s, offsetof(type, member)); \ 1647 1614 } \ 1648 1615 static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ 1649 1616 u##bits val) \ 1650 1617 { \ 1651 - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ 1618 + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ 1652 1619 btrfs_set_##bits(eb, s, offsetof(type, member), val); \ 1653 1620 } \ 1654 1621 static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ 1655 1622 const type *s) \ 1656 1623 { \ 1657 - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ 1624 + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ 1658 1625 return btrfs_get_token_##bits(token, s, offsetof(type, member));\ 1659 1626 } \ 1660 1627 static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ 1661 1628 type *s, u##bits val) \ 1662 1629 { \ 1663 - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type 
*)0))->member); \ 1630 + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ 1664 1631 btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ 1665 1632 } 1666 1633 ··· 1691 1658 static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, 1692 1659 struct btrfs_dev_item *s) 1693 1660 { 1694 - BUILD_BUG_ON(sizeof(u64) != 1695 - sizeof(((struct btrfs_dev_item *)0))->total_bytes); 1661 + static_assert(sizeof(u64) == 1662 + sizeof(((struct btrfs_dev_item *)0))->total_bytes); 1696 1663 return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, 1697 1664 total_bytes)); 1698 1665 } ··· 1700 1667 struct btrfs_dev_item *s, 1701 1668 u64 val) 1702 1669 { 1703 - BUILD_BUG_ON(sizeof(u64) != 1704 - sizeof(((struct btrfs_dev_item *)0))->total_bytes); 1670 + static_assert(sizeof(u64) == 1671 + sizeof(((struct btrfs_dev_item *)0))->total_bytes); 1705 1672 WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); 1706 1673 btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); 1707 1674 } ··· 2361 2328 BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, 2362 2329 num_devices, 64); 2363 2330 2331 + /* 2332 + * For extent tree v2 we overload the extent root with the block group root, as 2333 + * we will have multiple extent roots. 
2334 + */ 2335 + BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup, 2336 + extent_root, 64); 2337 + BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup, 2338 + extent_root_gen, 64); 2339 + BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level, 2340 + struct btrfs_root_backup, extent_root_level, 8); 2341 + 2364 2342 /* struct btrfs_balance_item */ 2365 2343 BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); 2366 2344 ··· 2506 2462 BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); 2507 2463 BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, 2508 2464 uuid_tree_generation, 64); 2465 + BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block, 2466 + block_group_root, 64); 2467 + BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation, 2468 + struct btrfs_super_block, 2469 + block_group_root_generation, 64); 2470 + BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block, 2471 + block_group_root_level, 8); 2509 2472 2510 2473 int btrfs_super_csum_size(const struct btrfs_super_block *s); 2511 2474 const char *btrfs_super_csum_name(u16 csum_type); ··· 2890 2839 struct btrfs_block_rsv *rsv); 2891 2840 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); 2892 2841 2893 - int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); 2842 + int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, 2843 + u64 disk_num_bytes); 2894 2844 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); 2895 2845 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, 2896 2846 u64 start, u64 end); ··· 3207 3155 struct btrfs_root *root, 3208 3156 struct btrfs_ordered_sum *sums); 3209 3157 blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, 3210 - u64 file_start, int contig); 3158 + u64 offset, bool 
one_ordered); 3211 3159 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 3212 3160 struct list_head *list, int search_commit); 3213 3161 void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, ··· 3308 3256 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, 3309 3257 struct page *page, u64 start, 3310 3258 u64 end, bool uptodate); 3259 + ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, 3260 + struct btrfs_ioctl_encoded_io_args *encoded); 3261 + ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, 3262 + const struct btrfs_ioctl_encoded_io_args *encoded); 3263 + 3311 3264 extern const struct dentry_operations btrfs_dentry_operations; 3312 3265 extern const struct iomap_ops btrfs_dio_iomap_ops; 3313 3266 extern const struct iomap_dio_ops btrfs_dio_ops; ··· 3375 3318 struct btrfs_trans_handle **trans_out); 3376 3319 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 3377 3320 struct btrfs_inode *inode, u64 start, u64 end); 3321 + ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, 3322 + const struct btrfs_ioctl_encoded_io_args *encoded); 3378 3323 int btrfs_release_file(struct inode *inode, struct file *file); 3379 3324 int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, 3380 3325 size_t num_pages, loff_t pos, size_t write_bytes, ··· 3833 3774 struct btrfs_root *root); 3834 3775 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, 3835 3776 struct btrfs_root *root); 3836 - int btrfs_recover_relocation(struct btrfs_root *root); 3777 + int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); 3837 3778 int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); 3838 3779 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, 3839 3780 struct btrfs_root *root, struct extent_buffer *buf,

+10 -8

fs/btrfs/delalloc-space.c

··· 270 270 } 271 271 272 272 static void calc_inode_reservations(struct btrfs_fs_info *fs_info, 273 - u64 num_bytes, u64 *meta_reserve, 274 - u64 *qgroup_reserve) 273 + u64 num_bytes, u64 disk_num_bytes, 274 + u64 *meta_reserve, u64 *qgroup_reserve) 275 275 { 276 276 u64 nr_extents = count_max_extents(num_bytes); 277 - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); 277 + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); 278 278 u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); 279 279 280 280 *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, ··· 288 288 *qgroup_reserve = nr_extents * fs_info->nodesize; 289 289 } 290 290 291 - int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) 291 + int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, 292 + u64 disk_num_bytes) 292 293 { 293 294 struct btrfs_root *root = inode->root; 294 295 struct btrfs_fs_info *fs_info = root->fs_info; ··· 319 318 } 320 319 321 320 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 321 + disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize); 322 322 323 323 /* 324 324 * We always want to do it this way, every other way is wrong and ends ··· 331 329 * everything out and try again, which is bad. This way we just 332 330 * over-reserve slightly, and clean up the mess when we are done. 
333 331 */ 334 - calc_inode_reservations(fs_info, num_bytes, &meta_reserve, 335 - &qgroup_reserve); 332 + calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, 333 + &meta_reserve, &qgroup_reserve); 336 334 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); 337 335 if (ret) 338 336 return ret; ··· 351 349 spin_lock(&inode->lock); 352 350 nr_extents = count_max_extents(num_bytes); 353 351 btrfs_mod_outstanding_extents(inode, nr_extents); 354 - inode->csum_bytes += num_bytes; 352 + inode->csum_bytes += disk_num_bytes; 355 353 btrfs_calculate_inode_block_rsv_size(fs_info, inode); 356 354 spin_unlock(&inode->lock); 357 355 ··· 456 454 ret = btrfs_check_data_free_space(inode, reserved, start, len); 457 455 if (ret < 0) 458 456 return ret; 459 - ret = btrfs_delalloc_reserve_metadata(inode, len); 457 + ret = btrfs_delalloc_reserve_metadata(inode, len, len); 460 458 if (ret < 0) { 461 459 btrfs_free_reserved_data_space(inode, *reserved, start, len); 462 460 extent_changeset_free(*reserved);

+11 -7

fs/btrfs/dev-replace.c

··· 243 243 struct btrfs_device *srcdev, 244 244 struct btrfs_device **device_out) 245 245 { 246 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 246 247 struct btrfs_device *device; 247 248 struct block_device *bdev; 248 249 struct rcu_string *name; ··· 272 271 273 272 sync_blockdev(bdev); 274 273 275 - list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { 274 + list_for_each_entry(device, &fs_devices->devices, dev_list) { 276 275 if (device->bdev == bdev) { 277 276 btrfs_err(fs_info, 278 277 "target device is in the filesystem!"); ··· 303 302 goto error; 304 303 } 305 304 rcu_assign_pointer(device->name, name); 305 + ret = lookup_bdev(device_path, &device->devt); 306 + if (ret) 307 + goto error; 306 308 307 309 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 308 310 device->generation = 0; ··· 324 320 device->mode = FMODE_EXCL; 325 321 device->dev_stats_valid = 1; 326 322 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 327 - device->fs_devices = fs_info->fs_devices; 323 + device->fs_devices = fs_devices; 328 324 329 325 ret = btrfs_get_dev_zone_info(device, false); 330 326 if (ret) 331 327 goto error; 332 328 333 - mutex_lock(&fs_info->fs_devices->device_list_mutex); 334 - list_add(&device->dev_list, &fs_info->fs_devices->devices); 335 - fs_info->fs_devices->num_devices++; 336 - fs_info->fs_devices->open_devices++; 337 - mutex_unlock(&fs_info->fs_devices->device_list_mutex); 329 + mutex_lock(&fs_devices->device_list_mutex); 330 + list_add(&device->dev_list, &fs_devices->devices); 331 + fs_devices->num_devices++; 332 + fs_devices->open_devices++; 333 + mutex_unlock(&fs_devices->device_list_mutex); 338 334 339 335 *device_out = device; 340 336 return 0;

+156 -63

fs/btrfs/disk-io.c

··· 441 441 else 442 442 ret = btrfs_check_leaf_full(eb); 443 443 444 - if (ret < 0) { 445 - btrfs_print_tree(eb, 0); 444 + if (ret < 0) 445 + goto error; 446 + 447 + /* 448 + * Also check the generation, the eb reached here must be newer than 449 + * last committed. Or something seriously wrong happened. 450 + */ 451 + if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { 452 + ret = -EUCLEAN; 446 453 btrfs_err(fs_info, 447 - "block=%llu write time tree block corruption detected", 448 - eb->start); 449 - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 450 - return ret; 454 + "block=%llu bad generation, have %llu expect > %llu", 455 + eb->start, btrfs_header_generation(eb), 456 + fs_info->last_trans_committed); 457 + goto error; 451 458 } 452 459 write_extent_buffer(eb, result, 0, fs_info->csum_size); 453 460 454 461 return 0; 462 + 463 + error: 464 + btrfs_print_tree(eb, 0); 465 + btrfs_err(fs_info, "block=%llu write time tree block corruption detected", 466 + eb->start); 467 + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 468 + return ret; 455 469 } 456 470 457 471 /* Checksum all dirty extent buffers in one bio_vec */ ··· 1303 1289 return root; 1304 1290 } 1305 1291 1292 + static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr) 1293 + { 1294 + struct btrfs_block_group *block_group; 1295 + u64 ret; 1296 + 1297 + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 1298 + return 0; 1299 + 1300 + if (bytenr) 1301 + block_group = btrfs_lookup_block_group(fs_info, bytenr); 1302 + else 1303 + block_group = btrfs_lookup_first_block_group(fs_info, bytenr); 1304 + ASSERT(block_group); 1305 + if (!block_group) 1306 + return 0; 1307 + ret = block_group->global_root_id; 1308 + btrfs_put_block_group(block_group); 1309 + 1310 + return ret; 1311 + } 1312 + 1306 1313 struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) 1307 1314 { 1308 1315 struct btrfs_key key = { 1309 1316 .objectid = BTRFS_CSUM_TREE_OBJECTID, 1310 1317 
.type = BTRFS_ROOT_ITEM_KEY, 1311 - .offset = 0, 1318 + .offset = btrfs_global_root_id(fs_info, bytenr), 1312 1319 }; 1313 1320 1314 1321 return btrfs_global_root(fs_info, &key); ··· 1340 1305 struct btrfs_key key = { 1341 1306 .objectid = BTRFS_EXTENT_TREE_OBJECTID, 1342 1307 .type = BTRFS_ROOT_ITEM_KEY, 1343 - .offset = 0, 1308 + .offset = btrfs_global_root_id(fs_info, bytenr), 1344 1309 }; 1345 1310 1346 1311 return btrfs_global_root(fs_info, &key); ··· 1557 1522 ret = PTR_ERR(root->node); 1558 1523 root->node = NULL; 1559 1524 goto fail; 1560 - } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { 1525 + } 1526 + if (!btrfs_buffer_uptodate(root->node, generation, 0)) { 1561 1527 ret = -EIO; 1562 1528 goto fail; 1563 1529 } ··· 1763 1727 btrfs_put_root(fs_info->uuid_root); 1764 1728 btrfs_put_root(fs_info->fs_root); 1765 1729 btrfs_put_root(fs_info->data_reloc_root); 1730 + btrfs_put_root(fs_info->block_group_root); 1766 1731 btrfs_check_leaked_roots(fs_info); 1767 1732 btrfs_extent_buffer_leak_debug_check(fs_info); 1768 1733 kfree(fs_info->super_copy); ··· 1962 1925 1963 1926 static int cleaner_kthread(void *arg) 1964 1927 { 1965 - struct btrfs_root *root = arg; 1966 - struct btrfs_fs_info *fs_info = root->fs_info; 1928 + struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg; 1967 1929 int again; 1968 1930 1969 1931 while (1) { ··· 1995 1959 1996 1960 btrfs_run_delayed_iputs(fs_info); 1997 1961 1998 - again = btrfs_clean_one_deleted_snapshot(root); 1962 + again = btrfs_clean_one_deleted_snapshot(fs_info); 1999 1963 mutex_unlock(&fs_info->cleaner_mutex); 2000 1964 2001 1965 /* ··· 2131 2095 { 2132 2096 const int next_backup = info->backup_root_index; 2133 2097 struct btrfs_root_backup *root_backup; 2134 - struct btrfs_root *extent_root = btrfs_extent_root(info, 0); 2135 - struct btrfs_root *csum_root = btrfs_csum_root(info, 0); 2136 2098 2137 2099 root_backup = info->super_for_commit->super_roots + next_backup; 2138 2100 ··· 2155 2121 
btrfs_set_backup_chunk_root_level(root_backup, 2156 2122 btrfs_header_level(info->chunk_root->node)); 2157 2123 2158 - btrfs_set_backup_extent_root(root_backup, extent_root->node->start); 2159 - btrfs_set_backup_extent_root_gen(root_backup, 2160 - btrfs_header_generation(extent_root->node)); 2161 - btrfs_set_backup_extent_root_level(root_backup, 2162 - btrfs_header_level(extent_root->node)); 2124 + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) { 2125 + btrfs_set_backup_block_group_root(root_backup, 2126 + info->block_group_root->node->start); 2127 + btrfs_set_backup_block_group_root_gen(root_backup, 2128 + btrfs_header_generation(info->block_group_root->node)); 2129 + btrfs_set_backup_block_group_root_level(root_backup, 2130 + btrfs_header_level(info->block_group_root->node)); 2131 + } else { 2132 + struct btrfs_root *extent_root = btrfs_extent_root(info, 0); 2133 + struct btrfs_root *csum_root = btrfs_csum_root(info, 0); 2134 + 2135 + btrfs_set_backup_extent_root(root_backup, 2136 + extent_root->node->start); 2137 + btrfs_set_backup_extent_root_gen(root_backup, 2138 + btrfs_header_generation(extent_root->node)); 2139 + btrfs_set_backup_extent_root_level(root_backup, 2140 + btrfs_header_level(extent_root->node)); 2141 + 2142 + btrfs_set_backup_csum_root(root_backup, csum_root->node->start); 2143 + btrfs_set_backup_csum_root_gen(root_backup, 2144 + btrfs_header_generation(csum_root->node)); 2145 + btrfs_set_backup_csum_root_level(root_backup, 2146 + btrfs_header_level(csum_root->node)); 2147 + } 2163 2148 2164 2149 /* 2165 2150 * we might commit during log recovery, which happens before we set ··· 2198 2145 btrfs_header_generation(info->dev_root->node)); 2199 2146 btrfs_set_backup_dev_root_level(root_backup, 2200 2147 btrfs_header_level(info->dev_root->node)); 2201 - 2202 - btrfs_set_backup_csum_root(root_backup, csum_root->node->start); 2203 - btrfs_set_backup_csum_root_gen(root_backup, 2204 - btrfs_header_generation(csum_root->node)); 2205 - 
btrfs_set_backup_csum_root_level(root_backup, 2206 - btrfs_header_level(csum_root->node)); 2207 2148 2208 2149 btrfs_set_backup_total_bytes(root_backup, 2209 2150 btrfs_super_total_bytes(info->super_copy)); ··· 2316 2269 free_root_extent_buffers(info->uuid_root); 2317 2270 free_root_extent_buffers(info->fs_root); 2318 2271 free_root_extent_buffers(info->data_reloc_root); 2272 + free_root_extent_buffers(info->block_group_root); 2319 2273 if (free_chunk_root) 2320 2274 free_root_extent_buffers(info->chunk_root); 2321 2275 } ··· 2552 2504 log_tree_root->node = NULL; 2553 2505 btrfs_put_root(log_tree_root); 2554 2506 return ret; 2555 - } else if (!extent_buffer_uptodate(log_tree_root->node)) { 2507 + } 2508 + if (!extent_buffer_uptodate(log_tree_root->node)) { 2556 2509 btrfs_err(fs_info, "failed to read log tree"); 2557 2510 btrfs_put_root(log_tree_root); 2558 2511 return -EIO; 2559 2512 } 2513 + 2560 2514 /* returns with log_tree_root freed on success */ 2561 2515 ret = btrfs_recover_log_trees(log_tree_root); 2562 2516 if (ret) { ··· 2583 2533 { 2584 2534 struct btrfs_fs_info *fs_info = tree_root->fs_info; 2585 2535 struct btrfs_root *root; 2536 + u64 max_global_id = 0; 2586 2537 int ret; 2587 2538 struct btrfs_key key = { 2588 2539 .objectid = objectid, ··· 2619 2568 break; 2620 2569 btrfs_release_path(path); 2621 2570 2571 + /* 2572 + * Just worry about this for extent tree, it'll be the same for 2573 + * everybody. 
2574 + */ 2575 + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) 2576 + max_global_id = max(max_global_id, key.offset); 2577 + 2622 2578 found = true; 2623 2579 root = read_tree_root_path(tree_root, path, &key); 2624 2580 if (IS_ERR(root)) { ··· 2642 2584 key.offset++; 2643 2585 } 2644 2586 btrfs_release_path(path); 2587 + 2588 + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) 2589 + fs_info->nr_global_roots = max_global_id + 1; 2645 2590 2646 2591 if (!found || ret) { 2647 2592 if (objectid == BTRFS_CSUM_TREE_OBJECTID) ··· 2991 2930 return ret; 2992 2931 } 2993 2932 2933 + static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) 2934 + { 2935 + int ret = 0; 2936 + 2937 + root->node = read_tree_block(root->fs_info, bytenr, 2938 + root->root_key.objectid, gen, level, NULL); 2939 + if (IS_ERR(root->node)) { 2940 + ret = PTR_ERR(root->node); 2941 + root->node = NULL; 2942 + return ret; 2943 + } 2944 + if (!extent_buffer_uptodate(root->node)) { 2945 + free_extent_buffer(root->node); 2946 + root->node = NULL; 2947 + return -EIO; 2948 + } 2949 + 2950 + btrfs_set_root_node(&root->root_item, root->node); 2951 + root->commit_root = btrfs_root_node(root); 2952 + btrfs_set_root_refs(&root->root_item, 1); 2953 + return ret; 2954 + } 2955 + 2956 + static int load_important_roots(struct btrfs_fs_info *fs_info) 2957 + { 2958 + struct btrfs_super_block *sb = fs_info->super_copy; 2959 + u64 gen, bytenr; 2960 + int level, ret; 2961 + 2962 + bytenr = btrfs_super_root(sb); 2963 + gen = btrfs_super_generation(sb); 2964 + level = btrfs_super_root_level(sb); 2965 + ret = load_super_root(fs_info->tree_root, bytenr, gen, level); 2966 + if (ret) { 2967 + btrfs_warn(fs_info, "couldn't read tree root"); 2968 + return ret; 2969 + } 2970 + 2971 + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 2972 + return 0; 2973 + 2974 + bytenr = btrfs_super_block_group_root(sb); 2975 + gen = btrfs_super_block_group_root_generation(sb); 2976 + level = 
btrfs_super_block_group_root_level(sb); 2977 + ret = load_super_root(fs_info->block_group_root, bytenr, gen, level); 2978 + if (ret) 2979 + btrfs_warn(fs_info, "couldn't read block group root"); 2980 + return ret; 2981 + } 2982 + 2994 2983 static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) 2995 2984 { 2996 2985 int backup_index = find_newest_super_backup(fs_info); ··· 3050 2939 int ret = 0; 3051 2940 int i; 3052 2941 3053 - for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { 3054 - u64 generation; 3055 - int level; 2942 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 2943 + struct btrfs_root *root; 3056 2944 2945 + root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID, 2946 + GFP_KERNEL); 2947 + if (!root) 2948 + return -ENOMEM; 2949 + fs_info->block_group_root = root; 2950 + } 2951 + 2952 + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { 3057 2953 if (handle_error) { 3058 2954 if (!IS_ERR(tree_root->node)) 3059 2955 free_extent_buffer(tree_root->node); ··· 3085 2967 if (ret < 0) 3086 2968 return ret; 3087 2969 } 3088 - generation = btrfs_super_generation(sb); 3089 - level = btrfs_super_root_level(sb); 3090 - tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), 3091 - BTRFS_ROOT_TREE_OBJECTID, 3092 - generation, level, NULL); 3093 - if (IS_ERR(tree_root->node)) { 3094 - handle_error = true; 3095 - ret = PTR_ERR(tree_root->node); 3096 - tree_root->node = NULL; 3097 - btrfs_warn(fs_info, "couldn't read tree root"); 3098 - continue; 3099 2970 3100 - } else if (!extent_buffer_uptodate(tree_root->node)) { 2971 + ret = load_important_roots(fs_info); 2972 + if (ret) { 3101 2973 handle_error = true; 3102 - ret = -EIO; 3103 - btrfs_warn(fs_info, "error while reading tree root"); 3104 2974 continue; 3105 2975 } 3106 - 3107 - btrfs_set_root_node(&tree_root->root_item, tree_root->node); 3108 - tree_root->commit_root = btrfs_root_node(tree_root); 3109 - btrfs_set_root_refs(&tree_root->root_item, 1); 3110 2976 3111 2977 /* 3112 2978 * No need 
to hold btrfs_root::objectid_mutex since the fs ··· 3111 3009 } 3112 3010 3113 3011 /* All successful */ 3114 - fs_info->generation = generation; 3115 - fs_info->last_trans_committed = generation; 3012 + fs_info->generation = btrfs_header_generation(tree_root->node); 3013 + fs_info->last_trans_committed = fs_info->generation; 3116 3014 fs_info->last_reloc_trans = 0; 3117 3015 3118 3016 /* Always begin writing backup roots after the one being used */ ··· 3395 3293 up_read(&fs_info->cleanup_work_sem); 3396 3294 3397 3295 mutex_lock(&fs_info->cleaner_mutex); 3398 - ret = btrfs_recover_relocation(fs_info->tree_root); 3296 + ret = btrfs_recover_relocation(fs_info); 3399 3297 mutex_unlock(&fs_info->cleaner_mutex); 3400 3298 if (ret < 0) { 3401 3299 btrfs_warn(fs_info, "failed to recover relocation: %d", ret); ··· 3696 3594 3697 3595 generation = btrfs_super_chunk_root_generation(disk_super); 3698 3596 level = btrfs_super_chunk_root_level(disk_super); 3699 - 3700 - chunk_root->node = read_tree_block(fs_info, 3701 - btrfs_super_chunk_root(disk_super), 3702 - BTRFS_CHUNK_TREE_OBJECTID, 3703 - generation, level, NULL); 3704 - if (IS_ERR(chunk_root->node) || 3705 - !extent_buffer_uptodate(chunk_root->node)) { 3597 + ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super), 3598 + generation, level); 3599 + if (ret) { 3706 3600 btrfs_err(fs_info, "failed to read chunk root"); 3707 - if (!IS_ERR(chunk_root->node)) 3708 - free_extent_buffer(chunk_root->node); 3709 - chunk_root->node = NULL; 3710 3601 goto fail_tree_roots; 3711 3602 } 3712 - btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 3713 - chunk_root->commit_root = btrfs_root_node(chunk_root); 3714 3603 3715 3604 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, 3716 3605 offsetof(struct btrfs_header, chunk_tree_uuid), ··· 3821 3728 goto fail_sysfs; 3822 3729 } 3823 3730 3824 - fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 3731 + fs_info->cleaner_kthread = 
kthread_run(cleaner_kthread, fs_info, 3825 3732 "btrfs-cleaner"); 3826 3733 if (IS_ERR(fs_info->cleaner_kthread)) 3827 3734 goto fail_sysfs;

+2

fs/btrfs/disk-io.h

··· 111 111 112 112 static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) 113 113 { 114 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 115 + return fs_info->block_group_root; 114 116 return btrfs_extent_root(fs_info, 0); 115 117 } 116 118

+68 -80

fs/btrfs/extent-tree.c

··· 598 598 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 599 599 struct btrfs_root *root, 600 600 struct btrfs_path *path, 601 - int refs_to_drop, int *last_ref) 601 + int refs_to_drop) 602 602 { 603 603 struct btrfs_key key; 604 604 struct btrfs_extent_data_ref *ref1 = NULL; ··· 631 631 632 632 if (num_refs == 0) { 633 633 ret = btrfs_del_item(trans, root, path); 634 - *last_ref = 1; 635 634 } else { 636 635 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 637 636 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); ··· 1071 1072 void update_inline_extent_backref(struct btrfs_path *path, 1072 1073 struct btrfs_extent_inline_ref *iref, 1073 1074 int refs_to_mod, 1074 - struct btrfs_delayed_extent_op *extent_op, 1075 - int *last_ref) 1075 + struct btrfs_delayed_extent_op *extent_op) 1076 1076 { 1077 1077 struct extent_buffer *leaf = path->nodes[0]; 1078 1078 struct btrfs_extent_item *ei; ··· 1119 1121 else 1120 1122 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1121 1123 } else { 1122 - *last_ref = 1; 1123 1124 size = btrfs_extent_inline_ref_size(type); 1124 1125 item_size = btrfs_item_size(leaf, path->slots[0]); 1125 1126 ptr = (unsigned long)iref; ··· 1163 1166 } 1164 1167 return -EUCLEAN; 1165 1168 } 1166 - update_inline_extent_backref(path, iref, refs_to_add, 1167 - extent_op, NULL); 1169 + update_inline_extent_backref(path, iref, refs_to_add, extent_op); 1168 1170 } else if (ret == -ENOENT) { 1169 1171 setup_inline_extent_backref(trans->fs_info, path, iref, parent, 1170 1172 root_objectid, owner, offset, ··· 1177 1181 struct btrfs_root *root, 1178 1182 struct btrfs_path *path, 1179 1183 struct btrfs_extent_inline_ref *iref, 1180 - int refs_to_drop, int is_data, int *last_ref) 1184 + int refs_to_drop, int is_data) 1181 1185 { 1182 1186 int ret = 0; 1183 1187 1184 1188 BUG_ON(!is_data && refs_to_drop != 1); 1185 - if (iref) { 1186 - update_inline_extent_backref(path, iref, -refs_to_drop, NULL, 1187 - last_ref); 1188 - } else if 
(is_data) { 1189 - ret = remove_extent_data_ref(trans, root, path, refs_to_drop, 1190 - last_ref); 1191 - } else { 1192 - *last_ref = 1; 1189 + if (iref) 1190 + update_inline_extent_backref(path, iref, -refs_to_drop, NULL); 1191 + else if (is_data) 1192 + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); 1193 + else 1193 1194 ret = btrfs_del_item(trans, root, path); 1194 - } 1195 1195 return ret; 1196 1196 } 1197 1197 ··· 2758 2766 spin_unlock(&cache->lock); 2759 2767 if (!readonly && return_free_space && 2760 2768 global_rsv->space_info == space_info) { 2761 - u64 to_add = len; 2762 - 2763 2769 spin_lock(&global_rsv->lock); 2764 2770 if (!global_rsv->full) { 2765 - to_add = min(len, global_rsv->size - 2766 - global_rsv->reserved); 2771 + u64 to_add = min(len, global_rsv->size - 2772 + global_rsv->reserved); 2773 + 2767 2774 global_rsv->reserved += to_add; 2768 2775 btrfs_space_info_update_bytes_may_use(fs_info, 2769 2776 space_info, to_add); ··· 2853 2862 return 0; 2854 2863 } 2855 2864 2865 + static int do_free_extent_accounting(struct btrfs_trans_handle *trans, 2866 + u64 bytenr, u64 num_bytes, bool is_data) 2867 + { 2868 + int ret; 2869 + 2870 + if (is_data) { 2871 + struct btrfs_root *csum_root; 2872 + 2873 + csum_root = btrfs_csum_root(trans->fs_info, bytenr); 2874 + ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); 2875 + if (ret) { 2876 + btrfs_abort_transaction(trans, ret); 2877 + return ret; 2878 + } 2879 + } 2880 + 2881 + ret = add_to_free_space_tree(trans, bytenr, num_bytes); 2882 + if (ret) { 2883 + btrfs_abort_transaction(trans, ret); 2884 + return ret; 2885 + } 2886 + 2887 + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); 2888 + if (ret) 2889 + btrfs_abort_transaction(trans, ret); 2890 + 2891 + return ret; 2892 + } 2893 + 2856 2894 /* 2857 2895 * Drop one or more refs of @node. 
2858 2896 * ··· 2963 2943 u64 refs; 2964 2944 u64 bytenr = node->bytenr; 2965 2945 u64 num_bytes = node->num_bytes; 2966 - int last_ref = 0; 2967 2946 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 2968 2947 2969 2948 extent_root = btrfs_extent_root(info, bytenr); ··· 3029 3010 } 3030 3011 /* Must be SHARED_* item, remove the backref first */ 3031 3012 ret = remove_extent_backref(trans, extent_root, path, 3032 - NULL, refs_to_drop, is_data, 3033 - &last_ref); 3013 + NULL, refs_to_drop, is_data); 3034 3014 if (ret) { 3035 3015 btrfs_abort_transaction(trans, ret); 3036 3016 goto out; ··· 3154 3136 } 3155 3137 if (found_extent) { 3156 3138 ret = remove_extent_backref(trans, extent_root, path, 3157 - iref, refs_to_drop, is_data, 3158 - &last_ref); 3139 + iref, refs_to_drop, is_data); 3159 3140 if (ret) { 3160 3141 btrfs_abort_transaction(trans, ret); 3161 3142 goto out; ··· 3199 3182 } 3200 3183 } 3201 3184 3202 - last_ref = 1; 3203 3185 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 3204 3186 num_to_del); 3205 3187 if (ret) { ··· 3207 3191 } 3208 3192 btrfs_release_path(path); 3209 3193 3210 - if (is_data) { 3211 - struct btrfs_root *csum_root; 3212 - csum_root = btrfs_csum_root(info, bytenr); 3213 - ret = btrfs_del_csums(trans, csum_root, bytenr, 3214 - num_bytes); 3215 - if (ret) { 3216 - btrfs_abort_transaction(trans, ret); 3217 - goto out; 3218 - } 3219 - } 3220 - 3221 - ret = add_to_free_space_tree(trans, bytenr, num_bytes); 3222 - if (ret) { 3223 - btrfs_abort_transaction(trans, ret); 3224 - goto out; 3225 - } 3226 - 3227 - ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); 3228 - if (ret) { 3229 - btrfs_abort_transaction(trans, ret); 3230 - goto out; 3231 - } 3194 + ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); 3232 3195 } 3233 3196 btrfs_release_path(path); 3234 3197 ··· 4600 4605 return ret; 4601 4606 } 4602 4607 4608 + static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 
bytenr, 4609 + u64 num_bytes) 4610 + { 4611 + struct btrfs_fs_info *fs_info = trans->fs_info; 4612 + int ret; 4613 + 4614 + ret = remove_from_free_space_tree(trans, bytenr, num_bytes); 4615 + if (ret) 4616 + return ret; 4617 + 4618 + ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); 4619 + if (ret) { 4620 + ASSERT(!ret); 4621 + btrfs_err(fs_info, "update block group failed for %llu %llu", 4622 + bytenr, num_bytes); 4623 + return ret; 4624 + } 4625 + 4626 + trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); 4627 + return 0; 4628 + } 4629 + 4603 4630 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4604 4631 u64 parent, u64 root_objectid, 4605 4632 u64 flags, u64 owner, u64 offset, ··· 4682 4665 btrfs_mark_buffer_dirty(path->nodes[0]); 4683 4666 btrfs_free_path(path); 4684 4667 4685 - ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); 4686 - if (ret) 4687 - return ret; 4688 - 4689 - ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true); 4690 - if (ret) { /* -ENOENT, logic error */ 4691 - btrfs_err(fs_info, "update block group failed for %llu %llu", 4692 - ins->objectid, ins->offset); 4693 - BUG(); 4694 - } 4695 - trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); 4696 - return ret; 4668 + return alloc_reserved_extent(trans, ins->objectid, ins->offset); 4697 4669 } 4698 4670 4699 4671 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ··· 4700 4694 struct extent_buffer *leaf; 4701 4695 struct btrfs_delayed_tree_ref *ref; 4702 4696 u32 size = sizeof(*extent_item) + sizeof(*iref); 4703 - u64 num_bytes; 4704 4697 u64 flags = extent_op->flags_to_set; 4705 4698 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 4706 4699 ··· 4709 4704 if (skinny_metadata) { 4710 4705 extent_key.offset = ref->level; 4711 4706 extent_key.type = BTRFS_METADATA_ITEM_KEY; 4712 - num_bytes = fs_info->nodesize; 4713 4707 } else { 4714 4708 
extent_key.offset = node->num_bytes; 4715 4709 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 4716 4710 size += sizeof(*block_info); 4717 - num_bytes = node->num_bytes; 4718 4711 } 4719 4712 4720 4713 path = btrfs_alloc_path(); ··· 4757 4754 btrfs_mark_buffer_dirty(leaf); 4758 4755 btrfs_free_path(path); 4759 4756 4760 - ret = remove_from_free_space_tree(trans, extent_key.objectid, 4761 - num_bytes); 4762 - if (ret) 4763 - return ret; 4764 - 4765 - ret = btrfs_update_block_group(trans, extent_key.objectid, 4766 - fs_info->nodesize, true); 4767 - if (ret) { /* -ENOENT, logic error */ 4768 - btrfs_err(fs_info, "update block group failed for %llu %llu", 4769 - extent_key.objectid, extent_key.offset); 4770 - BUG(); 4771 - } 4772 - 4773 - trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, 4774 - fs_info->nodesize); 4775 - return ret; 4757 + return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); 4776 4758 } 4777 4759 4778 4760 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,

+26 -19

fs/btrfs/extent_io.c

··· 2610 2610 * a good copy of the failed sector and if we succeed, we have setup 2611 2611 * everything for repair_io_failure to do the rest for us. 2612 2612 */ 2613 + ASSERT(failed_mirror); 2613 2614 failrec->failed_mirror = failed_mirror; 2614 2615 failrec->this_mirror++; 2615 2616 if (failrec->this_mirror == failed_mirror) ··· 2640 2639 const int icsum = bio_offset >> fs_info->sectorsize_bits; 2641 2640 struct bio *repair_bio; 2642 2641 struct btrfs_bio *repair_bbio; 2643 - blk_status_t status; 2644 2642 2645 2643 btrfs_debug(fs_info, 2646 2644 "repair read error: read error at %llu", start); ··· 2678 2678 "repair read error: submitting new read to mirror %d", 2679 2679 failrec->this_mirror); 2680 2680 2681 - status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, 2682 - failrec->bio_flags); 2683 - if (status) { 2684 - free_io_failure(failure_tree, tree, failrec); 2685 - bio_put(repair_bio); 2686 - } 2687 - return blk_status_to_errno(status); 2681 + /* 2682 + * At this point we have a bio, so any errors from submit_bio_hook() 2683 + * will be handled by the endio on the repair_bio, so we can't return an 2684 + * error here. 2685 + */ 2686 + submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags); 2687 + return BLK_STS_OK; 2688 2688 } 2689 2689 2690 2690 static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) ··· 3067 3067 goto readpage_ok; 3068 3068 3069 3069 if (is_data_inode(inode)) { 3070 + /* 3071 + * If we failed to submit the IO at all we'll have a 3072 + * mirror_num == 0, in which case we need to just mark 3073 + * the page with an error and unlock it and carry on. 3074 + */ 3075 + if (mirror == 0) 3076 + goto readpage_ok; 3077 + 3070 3078 /* 3071 3079 * btrfs_submit_read_repair() will handle all the good 3072 3080 * and bad sectors, we just continue to the next bvec. 
··· 3542 3534 } 3543 3535 3544 3536 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); 3545 - if (em_cached && !IS_ERR_OR_NULL(em)) { 3537 + if (em_cached && !IS_ERR(em)) { 3546 3538 BUG_ON(*em_cached); 3547 3539 refcount_inc(&em->refs); 3548 3540 *em_cached = em; ··· 3571 3563 u64 cur_end; 3572 3564 struct extent_map *em; 3573 3565 int ret = 0; 3574 - int nr = 0; 3575 3566 size_t pg_offset = 0; 3576 3567 size_t iosize; 3577 3568 size_t blocksize = inode->i_sb->s_blocksize; ··· 3615 3608 } 3616 3609 em = __get_extent_map(inode, page, pg_offset, cur, 3617 3610 end - cur + 1, em_cached); 3618 - if (IS_ERR_OR_NULL(em)) { 3611 + if (IS_ERR(em)) { 3619 3612 unlock_extent(tree, cur, end); 3620 3613 end_page_read(page, false, cur, end + 1 - cur); 3614 + ret = PTR_ERR(em); 3621 3615 break; 3622 3616 } 3623 3617 extent_offset = cur - em->start; ··· 3729 3721 end_bio_extent_readpage, 0, 3730 3722 this_bio_flag, 3731 3723 force_bio_submit); 3732 - if (!ret) { 3733 - nr++; 3734 - } else { 3724 + if (ret) { 3735 3725 unlock_extent(tree, cur, cur + iosize - 1); 3736 3726 end_page_read(page, false, cur, iosize); 3737 3727 goto out; ··· 3957 3951 } 3958 3952 3959 3953 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); 3960 - if (IS_ERR_OR_NULL(em)) { 3954 + if (IS_ERR(em)) { 3961 3955 btrfs_page_set_error(fs_info, page, cur, end - cur + 1); 3962 3956 ret = PTR_ERR_OR_ZERO(em); 3963 3957 break; ··· 4786 4780 return ret; 4787 4781 } 4788 4782 if (cache) { 4789 - /* Impiles write in zoned mode */ 4790 - btrfs_put_block_group(cache); 4791 - /* Mark the last eb in a block group */ 4783 + /* 4784 + * Implies write in zoned mode. Mark the last eb in a block group. 
4785 + */ 4792 4786 if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) 4793 4787 set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); 4788 + btrfs_put_block_group(cache); 4794 4789 } 4795 4790 ret = write_one_eb(eb, wbc, epd); 4796 4791 free_extent_buffer(eb); ··· 5397 5390 break; 5398 5391 len = ALIGN(len, sectorsize); 5399 5392 em = btrfs_get_extent_fiemap(inode, offset, len); 5400 - if (IS_ERR_OR_NULL(em)) 5393 + if (IS_ERR(em)) 5401 5394 return em; 5402 5395 5403 5396 /* if this isn't a hole return it */

+4

fs/btrfs/extent_map.c

··· 492 492 */ 493 493 void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) 494 494 { 495 + lockdep_assert_held_write(&tree->lock); 496 + 495 497 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); 496 498 rb_erase_cached(&em->rb_node, &tree->map); 497 499 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) ··· 508 506 struct extent_map *new, 509 507 int modified) 510 508 { 509 + lockdep_assert_held_write(&tree->lock); 510 + 511 511 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); 512 512 ASSERT(extent_map_in_tree(cur)); 513 513 if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))

+41 -35

fs/btrfs/file-item.c

··· 305 305 read_extent_buffer(path->nodes[0], dst, (unsigned long)item, 306 306 ret * csum_size); 307 307 out: 308 - if (ret == -ENOENT) 308 + if (ret == -ENOENT || ret == -EFBIG) 309 309 ret = 0; 310 310 return ret; 311 311 } ··· 368 368 { 369 369 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 370 370 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 371 + struct btrfs_bio *bbio = NULL; 371 372 struct btrfs_path *path; 372 373 const u32 sectorsize = fs_info->sectorsize; 373 374 const u32 csum_size = fs_info->csum_size; ··· 378 377 u8 *csum; 379 378 const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; 380 379 int count = 0; 380 + blk_status_t ret = BLK_STS_OK; 381 381 382 382 if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || 383 383 test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) ··· 402 400 return BLK_STS_RESOURCE; 403 401 404 402 if (!dst) { 405 - struct btrfs_bio *bbio = btrfs_bio(bio); 403 + bbio = btrfs_bio(bio); 406 404 407 405 if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { 408 406 bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); ··· 458 456 459 457 count = search_csum_tree(fs_info, path, cur_disk_bytenr, 460 458 search_len, csum_dst); 461 - if (count <= 0) { 462 - /* 463 - * Either we hit a critical error or we didn't find 464 - * the csum. 465 - * Either way, we put zero into the csums dst, and skip 466 - * to the next sector. 467 - */ 459 + if (count < 0) { 460 + ret = errno_to_blk_status(count); 461 + if (bbio) 462 + btrfs_bio_free_csum(bbio); 463 + break; 464 + } 465 + 466 + /* 467 + * We didn't find a csum for this range. We need to make sure 468 + * we complain loudly about this, because we are not NODATASUM. 469 + * 470 + * However for the DATA_RELOC inode we could potentially be 471 + * relocating data extents for a NODATASUM inode, so the inode 472 + * itself won't be marked with NODATASUM, but the extent we're 473 + * copying is in fact NODATASUM. 
If we don't find a csum we 474 + * assume this is the case. 475 + */ 476 + if (count == 0) { 468 477 memset(csum_dst, 0, csum_size); 469 478 count = 1; 470 479 471 - /* 472 - * For data reloc inode, we need to mark the range 473 - * NODATASUM so that balance won't report false csum 474 - * error. 475 - */ 476 480 if (BTRFS_I(inode)->root->root_key.objectid == 477 481 BTRFS_DATA_RELOC_TREE_OBJECTID) { 478 482 u64 file_offset; ··· 499 491 } 500 492 501 493 btrfs_free_path(path); 502 - return BLK_STS_OK; 494 + return ret; 503 495 } 504 496 505 497 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, ··· 620 612 return ret; 621 613 } 622 614 623 - /* 624 - * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio 615 + /** 616 + * Calculate checksums of the data contained inside a bio 617 + * 625 618 * @inode: Owner of the data inside the bio 626 619 * @bio: Contains the data to be checksummed 627 - * @file_start: offset in file this bio begins to describe 628 - * @contig: Boolean. If true/1 means all bio vecs in this bio are 629 - * contiguous and they begin at @file_start in the file. False/0 630 - * means this bio can contain potentially discontiguous bio vecs 631 - * so the logical offset of each should be calculated separately. 620 + * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the 621 + * file offsets are determined from the page offsets in the bio. 622 + * Otherwise, this is the starting file offset of the bio vecs in 623 + * @bio, which must be contiguous. 624 + * @one_ordered: If true, @bio only refers to one ordered extent. 
632 625 */ 633 626 blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, 634 - u64 file_start, int contig) 627 + u64 offset, bool one_ordered) 635 628 { 636 629 struct btrfs_fs_info *fs_info = inode->root->fs_info; 637 630 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 638 631 struct btrfs_ordered_sum *sums; 639 632 struct btrfs_ordered_extent *ordered = NULL; 633 + const bool use_page_offsets = (offset == (u64)-1); 640 634 char *data; 641 635 struct bvec_iter iter; 642 636 struct bio_vec bvec; 643 637 int index; 644 - int nr_sectors; 638 + unsigned int blockcount; 645 639 unsigned long total_bytes = 0; 646 640 unsigned long this_sum_bytes = 0; 647 641 int i; 648 - u64 offset; 649 642 unsigned nofs_flag; 650 643 651 644 nofs_flag = memalloc_nofs_save(); ··· 660 651 sums->len = bio->bi_iter.bi_size; 661 652 INIT_LIST_HEAD(&sums->list); 662 653 663 - if (contig) 664 - offset = file_start; 665 - else 666 - offset = 0; /* shut up gcc */ 667 - 668 654 sums->bytenr = bio->bi_iter.bi_sector << 9; 669 655 index = 0; 670 656 671 657 shash->tfm = fs_info->csum_shash; 672 658 673 659 bio_for_each_segment(bvec, bio, iter) { 674 - if (!contig) 660 + if (use_page_offsets) 675 661 offset = page_offset(bvec.bv_page) + bvec.bv_offset; 676 662 677 663 if (!ordered) { ··· 685 681 } 686 682 } 687 683 688 - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, 684 + blockcount = BTRFS_BYTES_TO_BLKS(fs_info, 689 685 bvec.bv_len + fs_info->sectorsize 690 686 - 1); 691 687 692 - for (i = 0; i < nr_sectors; i++) { 693 - if (offset >= ordered->file_offset + ordered->num_bytes || 694 - offset < ordered->file_offset) { 688 + for (i = 0; i < blockcount; i++) { 689 + if (!one_ordered && 690 + !in_range(offset, ordered->file_offset, 691 + ordered->num_bytes)) { 695 692 unsigned long bytes_left; 696 693 697 694 sums->len = this_sum_bytes; ··· 1216 1211 extent_start = key.offset; 1217 1212 extent_end = btrfs_file_extent_end(path); 1218 1213 em->ram_bytes = 
btrfs_file_extent_ram_bytes(leaf, fi); 1214 + em->generation = btrfs_file_extent_generation(leaf, fi); 1219 1215 if (type == BTRFS_FILE_EXTENT_REG || 1220 1216 type == BTRFS_FILE_EXTENT_PREALLOC) { 1221 1217 em->start = extent_start;

+59 -20

fs/btrfs/file.c

··· 691 691 int modify_tree = -1; 692 692 int update_refs; 693 693 int found = 0; 694 - int leafs_visited = 0; 695 694 struct btrfs_path *path = args->path; 696 695 697 696 args->bytes_found = 0; ··· 728 729 path->slots[0]--; 729 730 } 730 731 ret = 0; 731 - leafs_visited++; 732 732 next_slot: 733 733 leaf = path->nodes[0]; 734 734 if (path->slots[0] >= btrfs_header_nritems(leaf)) { ··· 739 741 ret = 0; 740 742 break; 741 743 } 742 - leafs_visited++; 743 744 leaf = path->nodes[0]; 744 745 recow = 1; 745 746 } ··· 984 987 * which case it unlocked our path, so check path->locks[0] matches a 985 988 * write lock. 986 989 */ 987 - if (!ret && args->replace_extent && leafs_visited == 1 && 990 + if (!ret && args->replace_extent && 988 991 path->locks[0] == BTRFS_WRITE_LOCK && 989 992 btrfs_leaf_free_space(leaf) >= 990 993 sizeof(struct btrfs_item) + args->extent_item_size) { ··· 1719 1722 fs_info->sectorsize); 1720 1723 WARN_ON(reserve_bytes == 0); 1721 1724 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 1722 - reserve_bytes); 1725 + reserve_bytes, 1726 + reserve_bytes); 1723 1727 if (ret) { 1724 1728 if (!only_release_metadata) 1725 1729 btrfs_free_reserved_data_space(BTRFS_I(inode), ··· 2037 2039 return err < 0 ? err : written; 2038 2040 } 2039 2041 2040 - static ssize_t btrfs_file_write_iter(struct kiocb *iocb, 2041 - struct iov_iter *from) 2042 + static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, 2043 + const struct btrfs_ioctl_encoded_io_args *encoded) 2044 + { 2045 + struct file *file = iocb->ki_filp; 2046 + struct inode *inode = file_inode(file); 2047 + loff_t count; 2048 + ssize_t ret; 2049 + 2050 + btrfs_inode_lock(inode, 0); 2051 + count = encoded->len; 2052 + ret = generic_write_checks_count(iocb, &count); 2053 + if (ret == 0 && count != encoded->len) { 2054 + /* 2055 + * The write got truncated by generic_write_checks_count(). We 2056 + * can't do a partial encoded write. 
2057 + */ 2058 + ret = -EFBIG; 2059 + } 2060 + if (ret || encoded->len == 0) 2061 + goto out; 2062 + 2063 + ret = btrfs_write_check(iocb, from, encoded->len); 2064 + if (ret < 0) 2065 + goto out; 2066 + 2067 + ret = btrfs_do_encoded_write(iocb, from, encoded); 2068 + out: 2069 + btrfs_inode_unlock(inode, 0); 2070 + return ret; 2071 + } 2072 + 2073 + ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, 2074 + const struct btrfs_ioctl_encoded_io_args *encoded) 2042 2075 { 2043 2076 struct file *file = iocb->ki_filp; 2044 2077 struct btrfs_inode *inode = BTRFS_I(file_inode(file)); 2045 - ssize_t num_written = 0; 2078 + ssize_t num_written, num_sync; 2046 2079 const bool sync = iocb->ki_flags & IOCB_DSYNC; 2047 2080 2048 2081 /* ··· 2084 2055 if (BTRFS_FS_ERROR(inode->root->fs_info)) 2085 2056 return -EROFS; 2086 2057 2087 - if (!(iocb->ki_flags & IOCB_DIRECT) && 2088 - (iocb->ki_flags & IOCB_NOWAIT)) 2058 + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) 2089 2059 return -EOPNOTSUPP; 2090 2060 2091 2061 if (sync) 2092 2062 atomic_inc(&inode->sync_writers); 2093 2063 2094 - if (iocb->ki_flags & IOCB_DIRECT) 2095 - num_written = btrfs_direct_write(iocb, from); 2096 - else 2097 - num_written = btrfs_buffered_write(iocb, from); 2064 + if (encoded) { 2065 + num_written = btrfs_encoded_write(iocb, from, encoded); 2066 + num_sync = encoded->len; 2067 + } else if (iocb->ki_flags & IOCB_DIRECT) { 2068 + num_written = num_sync = btrfs_direct_write(iocb, from); 2069 + } else { 2070 + num_written = num_sync = btrfs_buffered_write(iocb, from); 2071 + } 2098 2072 2099 2073 btrfs_set_inode_last_sub_trans(inode); 2100 2074 2101 - if (num_written > 0) 2102 - num_written = generic_write_sync(iocb, num_written); 2075 + if (num_sync > 0) { 2076 + num_sync = generic_write_sync(iocb, num_sync); 2077 + if (num_sync < 0) 2078 + num_written = num_sync; 2079 + } 2103 2080 2104 2081 if (sync) 2105 2082 atomic_dec(&inode->sync_writers); 2106 2083 2107 
2084 current->backing_dev_info = NULL; 2108 2085 return num_written; 2086 + } 2087 + 2088 + static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 2089 + { 2090 + return btrfs_do_write_iter(iocb, from, NULL); 2109 2091 } 2110 2092 2111 2093 int btrfs_release_file(struct inode *inode, struct file *filp) ··· 2514 2474 hole_em = alloc_extent_map(); 2515 2475 if (!hole_em) { 2516 2476 btrfs_drop_extent_cache(inode, offset, end - 1, 0); 2517 - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); 2477 + btrfs_set_inode_full_sync(inode); 2518 2478 } else { 2519 2479 hole_em->start = offset; 2520 2480 hole_em->len = end - offset; ··· 2535 2495 } while (ret == -EEXIST); 2536 2496 free_extent_map(hole_em); 2537 2497 if (ret) 2538 - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 2539 - &inode->runtime_flags); 2498 + btrfs_set_inode_full_sync(inode); 2540 2499 } 2541 2500 2542 2501 return 0; ··· 2889 2850 * maps for the replacement extents (or holes). 2890 2851 */ 2891 2852 if (extent_info && !extent_info->is_new_extent) 2892 - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); 2853 + btrfs_set_inode_full_sync(inode); 2893 2854 2894 2855 if (ret) 2895 2856 goto out_trans;

+2

fs/btrfs/free-space-tree.c

··· 25 25 .offset = 0, 26 26 }; 27 27 28 + if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2)) 29 + key.offset = block_group->global_root_id; 28 30 return btrfs_global_root(block_group->fs_info, &key); 29 31 } 30 32

+938 -245

fs/btrfs/inode.c

··· 66 66 struct extent_changeset *data_reserved; 67 67 }; 68 68 69 + struct btrfs_rename_ctx { 70 + /* Output field. Stores the index number of the old directory entry. */ 71 + u64 index; 72 + }; 73 + 69 74 static const struct inode_operations btrfs_dir_inode_operations; 70 75 static const struct inode_operations btrfs_symlink_inode_operations; 71 76 static const struct inode_operations btrfs_special_inode_operations; ··· 239 234 * no overlapping inline items exist in the btree 240 235 */ 241 236 static int insert_inline_extent(struct btrfs_trans_handle *trans, 242 - struct btrfs_path *path, bool extent_inserted, 243 - struct btrfs_root *root, struct inode *inode, 244 - u64 start, size_t size, size_t compressed_size, 237 + struct btrfs_path *path, 238 + struct btrfs_inode *inode, bool extent_inserted, 239 + size_t size, size_t compressed_size, 245 240 int compress_type, 246 - struct page **compressed_pages) 241 + struct page **compressed_pages, 242 + bool update_i_size) 247 243 { 244 + struct btrfs_root *root = inode->root; 248 245 struct extent_buffer *leaf; 249 246 struct page *page = NULL; 250 247 char *kaddr; ··· 254 247 struct btrfs_file_extent_item *ei; 255 248 int ret; 256 249 size_t cur_size = size; 257 - unsigned long offset; 250 + u64 i_size; 258 251 259 252 ASSERT((compressed_size > 0 && compressed_pages) || 260 253 (compressed_size == 0 && !compressed_pages)); ··· 266 259 struct btrfs_key key; 267 260 size_t datasize; 268 261 269 - key.objectid = btrfs_ino(BTRFS_I(inode)); 270 - key.offset = start; 262 + key.objectid = btrfs_ino(inode); 263 + key.offset = 0; 271 264 key.type = BTRFS_EXTENT_DATA_KEY; 272 265 273 266 datasize = btrfs_file_extent_calc_inline_size(cur_size); ··· 305 298 btrfs_set_file_extent_compression(leaf, ei, 306 299 compress_type); 307 300 } else { 308 - page = find_get_page(inode->i_mapping, 309 - start >> PAGE_SHIFT); 301 + page = find_get_page(inode->vfs_inode.i_mapping, 0); 310 302 btrfs_set_file_extent_compression(leaf, ei, 0); 
311 303 kaddr = kmap_atomic(page); 312 - offset = offset_in_page(start); 313 - write_extent_buffer(leaf, kaddr + offset, ptr, size); 304 + write_extent_buffer(leaf, kaddr, ptr, size); 314 305 kunmap_atomic(kaddr); 315 306 put_page(page); 316 307 } ··· 319 314 * We align size to sectorsize for inline extents just for simplicity 320 315 * sake. 321 316 */ 322 - size = ALIGN(size, root->fs_info->sectorsize); 323 - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size); 317 + ret = btrfs_inode_set_file_extent_range(inode, 0, 318 + ALIGN(size, root->fs_info->sectorsize)); 324 319 if (ret) 325 320 goto fail; 326 321 327 322 /* 328 - * we're an inline extent, so nobody can 329 - * extend the file past i_size without locking 330 - * a page we already have locked. 323 + * We're an inline extent, so nobody can extend the file past i_size 324 + * without locking a page we already have locked. 331 325 * 332 - * We must do any isize and inode updates 333 - * before we unlock the pages. Otherwise we 334 - * could end up racing with unlink. 326 + * We must do any i_size and inode updates before we unlock the pages. 327 + * Otherwise we could end up racing with unlink. 335 328 */ 336 - BTRFS_I(inode)->disk_i_size = inode->i_size; 329 + i_size = i_size_read(&inode->vfs_inode); 330 + if (update_i_size && size > i_size) { 331 + i_size_write(&inode->vfs_inode, size); 332 + i_size = size; 333 + } 334 + inode->disk_i_size = i_size; 335 + 337 336 fail: 338 337 return ret; 339 338 } ··· 348 339 * does the checks required to make sure the data is small enough 349 340 * to fit as an inline extent. 
350 341 */ 351 - static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, 352 - u64 end, size_t compressed_size, 342 + static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, 343 + size_t compressed_size, 353 344 int compress_type, 354 - struct page **compressed_pages) 345 + struct page **compressed_pages, 346 + bool update_i_size) 355 347 { 356 348 struct btrfs_drop_extents_args drop_args = { 0 }; 357 349 struct btrfs_root *root = inode->root; 358 350 struct btrfs_fs_info *fs_info = root->fs_info; 359 351 struct btrfs_trans_handle *trans; 360 - u64 isize = i_size_read(&inode->vfs_inode); 361 - u64 actual_end = min(end + 1, isize); 362 - u64 inline_len = actual_end - start; 363 - u64 aligned_end = ALIGN(end, fs_info->sectorsize); 364 - u64 data_len = inline_len; 352 + u64 data_len = (compressed_size ?: size); 365 353 int ret; 366 354 struct btrfs_path *path; 367 355 368 - if (compressed_size) 369 - data_len = compressed_size; 370 - 371 - if (start > 0 || 372 - actual_end > fs_info->sectorsize || 356 + /* 357 + * We can create an inline extent if it ends at or beyond the current 358 + * i_size, is no larger than a sector (decompressed), and the (possibly 359 + * compressed) data fits in a leaf and the configured maximum inline 360 + * size. 
361 + */ 362 + if (size < i_size_read(&inode->vfs_inode) || 363 + size > fs_info->sectorsize || 373 364 data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || 374 - (!compressed_size && 375 - (actual_end & (fs_info->sectorsize - 1)) == 0) || 376 - end + 1 < isize || 377 - data_len > fs_info->max_inline) { 365 + data_len > fs_info->max_inline) 378 366 return 1; 379 - } 380 367 381 368 path = btrfs_alloc_path(); 382 369 if (!path) ··· 386 381 trans->block_rsv = &inode->block_rsv; 387 382 388 383 drop_args.path = path; 389 - drop_args.start = start; 390 - drop_args.end = aligned_end; 384 + drop_args.start = 0; 385 + drop_args.end = fs_info->sectorsize; 391 386 drop_args.drop_cache = true; 392 387 drop_args.replace_extent = true; 393 - 394 - if (compressed_size && compressed_pages) 395 - drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( 396 - compressed_size); 397 - else 398 - drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( 399 - inline_len); 400 - 388 + drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); 401 389 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 402 390 if (ret) { 403 391 btrfs_abort_transaction(trans, ret); 404 392 goto out; 405 393 } 406 394 407 - if (isize > actual_end) 408 - inline_len = min_t(u64, isize, actual_end); 409 - ret = insert_inline_extent(trans, path, drop_args.extent_inserted, 410 - root, &inode->vfs_inode, start, 411 - inline_len, compressed_size, 412 - compress_type, compressed_pages); 395 + ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, 396 + size, compressed_size, compress_type, 397 + compressed_pages, update_i_size); 413 398 if (ret && ret != -ENOSPC) { 414 399 btrfs_abort_transaction(trans, ret); 415 400 goto out; ··· 408 413 goto out; 409 414 } 410 415 411 - btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found); 416 + btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); 412 417 ret = btrfs_update_inode(trans, root, inode); 
413 418 if (ret && ret != -ENOSPC) { 414 419 btrfs_abort_transaction(trans, ret); ··· 418 423 goto out; 419 424 } 420 425 421 - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); 426 + btrfs_set_inode_full_sync(inode); 422 427 out: 423 428 /* 424 429 * Don't forget to free the reserved space, as for inlined extent ··· 619 624 again: 620 625 will_compress = 0; 621 626 nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; 622 - BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0); 623 627 nr_pages = min_t(unsigned long, nr_pages, 624 628 BTRFS_MAX_COMPRESSED / PAGE_SIZE); 625 629 ··· 729 735 /* we didn't compress the entire range, try 730 736 * to make an uncompressed inline extent. 731 737 */ 732 - ret = cow_file_range_inline(BTRFS_I(inode), start, end, 738 + ret = cow_file_range_inline(BTRFS_I(inode), actual_end, 733 739 0, BTRFS_COMPRESS_NONE, 734 - NULL); 740 + NULL, false); 735 741 } else { 736 742 /* try making a compressed inline extent */ 737 - ret = cow_file_range_inline(BTRFS_I(inode), start, end, 743 + ret = cow_file_range_inline(BTRFS_I(inode), actual_end, 738 744 total_compressed, 739 - compress_type, pages); 745 + compress_type, pages, 746 + false); 740 747 } 741 748 if (ret <= 0) { 742 749 unsigned long clear_flags = EXTENT_DELALLOC | ··· 976 981 } 977 982 free_extent_map(em); 978 983 979 - ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */ 980 - ins.objectid, /* disk_bytenr */ 981 - async_extent->ram_size, /* num_bytes */ 982 - ins.offset, /* disk_num_bytes */ 983 - async_extent->compress_type); 984 + ret = btrfs_add_ordered_extent(inode, start, /* file_offset */ 985 + async_extent->ram_size, /* num_bytes */ 986 + async_extent->ram_size, /* ram_bytes */ 987 + ins.objectid, /* disk_bytenr */ 988 + ins.offset, /* disk_num_bytes */ 989 + 0, /* offset */ 990 + 1 << BTRFS_ORDERED_COMPRESSED, 991 + async_extent->compress_type); 984 992 if (ret) { 985 993 btrfs_drop_extent_cache(inode, start, end, 0); 986 994 goto 
out_free_reserve; ··· 1001 1003 async_extent->pages, /* compressed_pages */ 1002 1004 async_extent->nr_pages, 1003 1005 async_chunk->write_flags, 1004 - async_chunk->blkcg_css)) { 1006 + async_chunk->blkcg_css, true)) { 1005 1007 const u64 start = async_extent->start; 1006 1008 const u64 end = start + async_extent->ram_size - 1; 1007 1009 ··· 1150 1152 * So here we skip inline extent creation completely. 1151 1153 */ 1152 1154 if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { 1155 + u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), 1156 + end + 1); 1157 + 1153 1158 /* lets try to make an inline extent */ 1154 - ret = cow_file_range_inline(inode, start, end, 0, 1155 - BTRFS_COMPRESS_NONE, NULL); 1159 + ret = cow_file_range_inline(inode, actual_end, 0, 1160 + BTRFS_COMPRESS_NONE, NULL, false); 1156 1161 if (ret == 0) { 1157 1162 /* 1158 1163 * We use DO_ACCOUNTING here because we need the ··· 1235 1234 } 1236 1235 free_extent_map(em); 1237 1236 1238 - ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 1239 - ram_size, cur_alloc_size, 1240 - BTRFS_ORDERED_REGULAR); 1237 + ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size, 1238 + ins.objectid, cur_alloc_size, 0, 1239 + 1 << BTRFS_ORDERED_REGULAR, 1240 + BTRFS_COMPRESS_NONE); 1241 1241 if (ret) 1242 1242 goto out_drop_extent_cache; 1243 1243 ··· 1897 1895 goto error; 1898 1896 } 1899 1897 free_extent_map(em); 1900 - ret = btrfs_add_ordered_extent(inode, cur_offset, 1901 - disk_bytenr, num_bytes, 1902 - num_bytes, 1903 - BTRFS_ORDERED_PREALLOC); 1898 + ret = btrfs_add_ordered_extent(inode, 1899 + cur_offset, num_bytes, num_bytes, 1900 + disk_bytenr, num_bytes, 0, 1901 + 1 << BTRFS_ORDERED_PREALLOC, 1902 + BTRFS_COMPRESS_NONE); 1904 1903 if (ret) { 1905 1904 btrfs_drop_extent_cache(inode, cur_offset, 1906 1905 cur_offset + num_bytes - 1, ··· 1910 1907 } 1911 1908 } else { 1912 1909 ret = btrfs_add_ordered_extent(inode, cur_offset, 1910 + num_bytes, num_bytes, 1913 1911 disk_bytenr, 
num_bytes, 1914 - num_bytes, 1915 - BTRFS_ORDERED_NOCOW); 1912 + 0, 1913 + 1 << BTRFS_ORDERED_NOCOW, 1914 + BTRFS_COMPRESS_NONE); 1916 1915 if (ret) 1917 1916 goto error; 1918 1917 } ··· 2315 2310 static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, 2316 2311 u64 dio_file_offset) 2317 2312 { 2318 - return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); 2313 + return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); 2319 2314 } 2320 2315 2321 2316 /* ··· 2543 2538 goto out; 2544 2539 2545 2540 if (bio_flags & EXTENT_BIO_COMPRESSED) { 2541 + /* 2542 + * btrfs_submit_compressed_read will handle completing 2543 + * the bio if there were any errors, so just return 2544 + * here. 2545 + */ 2546 2546 ret = btrfs_submit_compressed_read(inode, bio, 2547 2547 mirror_num, 2548 2548 bio_flags); 2549 - goto out; 2549 + goto out_no_endio; 2550 2550 } else { 2551 2551 /* 2552 2552 * Lookup bio sums does extra checks around whether we ··· 2572 2562 0, btrfs_submit_bio_start); 2573 2563 goto out; 2574 2564 } else if (!skip_sum) { 2575 - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); 2565 + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); 2576 2566 if (ret) 2577 2567 goto out; 2578 2568 } ··· 2585 2575 bio->bi_status = ret; 2586 2576 bio_endio(bio); 2587 2577 } 2578 + out_no_endio: 2588 2579 return ret; 2589 2580 } 2590 2581 ··· 2881 2870 struct btrfs_key ins; 2882 2871 u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); 2883 2872 u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); 2873 + u64 offset = btrfs_stack_file_extent_offset(stack_fi); 2884 2874 u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); 2885 2875 u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); 2886 2876 struct btrfs_drop_extents_args drop_args = { 0 }; ··· 2956 2944 goto out; 2957 2945 2958 2946 ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), 2959 - file_pos, qgroup_reserved, &ins); 2947 
+ file_pos - offset, 2948 + qgroup_reserved, &ins); 2960 2949 out: 2961 2950 btrfs_free_path(path); 2962 2951 ··· 2983 2970 struct btrfs_ordered_extent *oe) 2984 2971 { 2985 2972 struct btrfs_file_extent_item stack_fi; 2986 - u64 logical_len; 2987 2973 bool update_inode_bytes; 2974 + u64 num_bytes = oe->num_bytes; 2975 + u64 ram_bytes = oe->ram_bytes; 2988 2976 2989 2977 memset(&stack_fi, 0, sizeof(stack_fi)); 2990 2978 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); 2991 2979 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); 2992 2980 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, 2993 2981 oe->disk_num_bytes); 2982 + btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); 2994 2983 if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) 2995 - logical_len = oe->truncated_len; 2996 - else 2997 - logical_len = oe->num_bytes; 2998 - btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len); 2999 - btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len); 2984 + num_bytes = ram_bytes = oe->truncated_len; 2985 + btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); 2986 + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); 3000 2987 btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); 3001 2988 /* Encryption and other encoding is reserved and all 0 */ 3002 2989 ··· 3007 2994 * except if the ordered extent was truncated. 
3008 2995 */ 3009 2996 update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || 2997 + test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || 3010 2998 test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); 3011 2999 3012 3000 return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), ··· 3042 3028 3043 3029 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && 3044 3030 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && 3045 - !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) 3031 + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && 3032 + !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) 3046 3033 clear_bits |= EXTENT_DELALLOC_NEW; 3047 3034 3048 3035 freespace_inode = btrfs_is_free_space_inode(inode); ··· 4077 4062 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, 4078 4063 struct btrfs_inode *dir, 4079 4064 struct btrfs_inode *inode, 4080 - const char *name, int name_len) 4065 + const char *name, int name_len, 4066 + struct btrfs_rename_ctx *rename_ctx) 4081 4067 { 4082 4068 struct btrfs_root *root = dir->root; 4083 4069 struct btrfs_fs_info *fs_info = root->fs_info; ··· 4134 4118 goto err; 4135 4119 } 4136 4120 skip_backref: 4121 + if (rename_ctx) 4122 + rename_ctx->index = index; 4123 + 4137 4124 ret = btrfs_delete_delayed_dir_index(trans, dir, index); 4138 4125 if (ret) { 4139 4126 btrfs_abort_transaction(trans, ret); 4140 4127 goto err; 4141 4128 } 4142 4129 4143 - btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, 4144 - dir_ino); 4145 - btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); 4130 + /* 4131 + * If we are in a rename context, we don't need to update anything in the 4132 + * log. That will be done later during the rename by btrfs_log_new_name(). 4133 + * Besides that, doing it here would only cause extra unncessary btree 4134 + * operations on the log tree, increasing latency for applications. 
4135 + */ 4136 + if (!rename_ctx) { 4137 + btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, 4138 + dir_ino); 4139 + btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, 4140 + index); 4141 + } 4146 4142 4147 4143 /* 4148 4144 * If we have a pending delayed iput we could end up with the final iput ··· 4186 4158 const char *name, int name_len) 4187 4159 { 4188 4160 int ret; 4189 - ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len); 4161 + ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL); 4190 4162 if (!ret) { 4191 4163 drop_nlink(&inode->vfs_inode); 4192 4164 ret = btrfs_update_inode(trans, inode->root, inode); ··· 4593 4565 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) 4594 4566 { 4595 4567 struct inode *inode = d_inode(dentry); 4568 + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 4596 4569 int err = 0; 4597 4570 struct btrfs_trans_handle *trans; 4598 4571 u64 last_unlink_trans; 4599 4572 4600 4573 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 4601 4574 return -ENOTEMPTY; 4602 - if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) 4575 + if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { 4576 + if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { 4577 + btrfs_err(fs_info, 4578 + "extent tree v2 doesn't support snapshot deletion yet"); 4579 + return -EOPNOTSUPP; 4580 + } 4603 4581 return btrfs_delete_subvolume(dir, dentry); 4582 + } 4604 4583 4605 4584 trans = __unlink_start_trans(dir); 4606 4585 if (IS_ERR(trans)) ··· 4646 4611 } 4647 4612 out: 4648 4613 btrfs_end_transaction(trans); 4649 - btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); 4614 + btrfs_btree_balance_dirty(fs_info); 4650 4615 4651 4616 return err; 4652 4617 } ··· 4699 4664 goto out; 4700 4665 } 4701 4666 } 4702 - ret = btrfs_delalloc_reserve_metadata(inode, blocksize); 4667 + ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize); 4703 4668 if (ret < 0) { 4704 4669 if 
(!only_release_metadata) 4705 4670 btrfs_free_reserved_data_space(inode, data_reserved, ··· 4911 4876 cur_offset + hole_size - 1, 0); 4912 4877 hole_em = alloc_extent_map(); 4913 4878 if (!hole_em) { 4914 - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4915 - &inode->runtime_flags); 4879 + btrfs_set_inode_full_sync(inode); 4916 4880 goto next; 4917 4881 } 4918 4882 hole_em->start = cur_offset; ··· 5618 5584 return inode; 5619 5585 } 5620 5586 5587 + static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); 5588 + static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); 5589 + static_assert(BTRFS_FT_DIR == FT_DIR); 5590 + static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); 5591 + static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); 5592 + static_assert(BTRFS_FT_FIFO == FT_FIFO); 5593 + static_assert(BTRFS_FT_SOCK == FT_SOCK); 5594 + static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); 5595 + 5621 5596 static inline u8 btrfs_inode_type(struct inode *inode) 5622 5597 { 5623 - /* 5624 - * Compile-time asserts that generic FT_* types still match 5625 - * BTRFS_FT_* types 5626 - */ 5627 - BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); 5628 - BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); 5629 - BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); 5630 - BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); 5631 - BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); 5632 - BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); 5633 - BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); 5634 - BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); 5635 - 5636 5598 return fs_umode_to_ftype(inode->i_mode); 5637 5599 } 5638 5600 ··· 6001 5971 goto out; 6002 5972 ret = 0; 6003 5973 6004 - /* 6005 - * MAGIC NUMBER EXPLANATION: 6006 - * since we search a directory based on f_pos we have to start at 2 6007 - * since '.' and '..' 
have f_pos of 0 and 1 respectively, so everybody 6008 - * else has to start at 2 6009 - */ 6010 5974 if (path->slots[0] == 0) { 6011 - inode->index_cnt = 2; 5975 + inode->index_cnt = BTRFS_DIR_START_INDEX; 6012 5976 goto out; 6013 5977 } 6014 5978 ··· 6013 5989 6014 5990 if (found_key.objectid != btrfs_ino(inode) || 6015 5991 found_key.type != BTRFS_DIR_INDEX_KEY) { 6016 - inode->index_cnt = 2; 5992 + inode->index_cnt = BTRFS_DIR_START_INDEX; 6017 5993 goto out; 6018 5994 } 6019 5995 ··· 6164 6140 * sync since it will be a full sync anyway and this will blow away the 6165 6141 * old info in the log. 6166 6142 */ 6167 - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 6143 + btrfs_set_inode_full_sync(BTRFS_I(inode)); 6168 6144 6169 6145 key[0].objectid = objectid; 6170 6146 key[0].type = BTRFS_INODE_ITEM_KEY; ··· 6561 6537 goto fail; 6562 6538 } 6563 6539 d_instantiate(dentry, inode); 6564 - btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); 6540 + btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); 6565 6541 } 6566 6542 6567 6543 fail: ··· 7064 7040 if (IS_ERR(em)) 7065 7041 goto out; 7066 7042 } 7067 - ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, 7068 - block_len, type); 7043 + ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, 7044 + block_len, 0, 7045 + (1 << type) | 7046 + (1 << BTRFS_ORDERED_DIRECT), 7047 + BTRFS_COMPRESS_NONE); 7069 7048 if (ret) { 7070 7049 if (em) { 7071 7050 free_extent_map(em); ··· 7468 7441 struct extent_map *em2; 7469 7442 7470 7443 /* We can NOCOW, so only need to reserve metadata space. */ 7471 - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); 7444 + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len); 7472 7445 if (ret < 0) { 7473 7446 /* Our caller expects us to free the input extent map. 
*/ 7474 7447 free_extent_map(em); ··· 7858 7831 struct bio *bio, 7859 7832 u64 dio_file_offset) 7860 7833 { 7861 - return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1); 7834 + return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); 7862 7835 } 7863 7836 7864 7837 static void btrfs_end_dio_bio(struct bio *bio) ··· 7915 7888 * If we aren't doing async submit, calculate the csum of the 7916 7889 * bio now. 7917 7890 */ 7918 - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1); 7891 + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); 7919 7892 if (ret) 7920 7893 goto err; 7921 7894 } else { ··· 8131 8104 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); 8132 8105 8133 8106 ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); 8134 - if (bio_ctrl.bio) 8135 - ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); 8107 + if (bio_ctrl.bio) { 8108 + int ret2; 8109 + 8110 + ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); 8111 + if (ret == 0) 8112 + ret = ret2; 8113 + } 8136 8114 return ret; 8137 8115 } 8138 8116 ··· 8766 8734 * extents beyond i_size to drop. 
8767 8735 */ 8768 8736 if (control.extents_found > 0) 8769 - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 8737 + btrfs_set_inode_full_sync(BTRFS_I(inode)); 8770 8738 8771 8739 return ret; 8772 8740 } ··· 9062 9030 struct inode *new_inode = new_dentry->d_inode; 9063 9031 struct inode *old_inode = old_dentry->d_inode; 9064 9032 struct timespec64 ctime = current_time(old_inode); 9033 + struct btrfs_rename_ctx old_rename_ctx; 9034 + struct btrfs_rename_ctx new_rename_ctx; 9065 9035 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 9066 9036 u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); 9067 9037 u64 old_idx = 0; 9068 9038 u64 new_idx = 0; 9069 9039 int ret; 9070 9040 int ret2; 9071 - bool root_log_pinned = false; 9072 - bool dest_log_pinned = false; 9073 9041 bool need_abort = false; 9074 9042 9075 9043 /* ··· 9172 9140 BTRFS_I(new_inode), 1); 9173 9141 } 9174 9142 9175 - /* 9176 - * Now pin the logs of the roots. We do it to ensure that no other task 9177 - * can sync the logs while we are in progress with the rename, because 9178 - * that could result in an inconsistency in case any of the inodes that 9179 - * are part of this rename operation were logged before. 9180 - * 9181 - * We pin the logs even if at this precise moment none of the inodes was 9182 - * logged before. This is because right after we checked for that, some 9183 - * other task fsyncing some other inode not involved with this rename 9184 - * operation could log that one of our inodes exists. 9185 - * 9186 - * We don't need to pin the logs before the above calls to 9187 - * btrfs_insert_inode_ref(), since those don't ever need to change a log. 
9188 - */ 9189 - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 9190 - btrfs_pin_log_trans(root); 9191 - root_log_pinned = true; 9192 - } 9193 - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { 9194 - btrfs_pin_log_trans(dest); 9195 - dest_log_pinned = true; 9196 - } 9197 - 9198 9143 /* src is a subvolume */ 9199 9144 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 9200 9145 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); ··· 9179 9170 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), 9180 9171 BTRFS_I(old_dentry->d_inode), 9181 9172 old_dentry->d_name.name, 9182 - old_dentry->d_name.len); 9173 + old_dentry->d_name.len, 9174 + &old_rename_ctx); 9183 9175 if (!ret) 9184 9176 ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); 9185 9177 } ··· 9196 9186 ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), 9197 9187 BTRFS_I(new_dentry->d_inode), 9198 9188 new_dentry->d_name.name, 9199 - new_dentry->d_name.len); 9189 + new_dentry->d_name.len, 9190 + &new_rename_ctx); 9200 9191 if (!ret) 9201 9192 ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); 9202 9193 } ··· 9227 9216 if (new_inode->i_nlink == 1) 9228 9217 BTRFS_I(new_inode)->dir_index = new_idx; 9229 9218 9230 - if (root_log_pinned) { 9231 - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), 9232 - new_dentry->d_parent); 9233 - btrfs_end_log_trans(root); 9234 - root_log_pinned = false; 9235 - } 9236 - if (dest_log_pinned) { 9237 - btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), 9238 - old_dentry->d_parent); 9239 - btrfs_end_log_trans(dest); 9240 - dest_log_pinned = false; 9241 - } 9242 - out_fail: 9243 9219 /* 9244 - * If we have pinned a log and an error happened, we unpin tasks 9245 - * trying to sync the log and force them to fallback to a transaction 9246 - * commit if the log currently contains any of the inodes involved in 9247 - * this rename operation (to ensure we do not persist a log with an 9248 - * inconsistent state for any of these inodes or leading to any 9249 
- * inconsistencies when replayed). If the transaction was aborted, the 9250 - * abortion reason is propagated to userspace when attempting to commit 9251 - * the transaction. If the log does not contain any of these inodes, we 9252 - * allow the tasks to sync it. 9220 + * Now pin the logs of the roots. We do it to ensure that no other task 9221 + * can sync the logs while we are in progress with the rename, because 9222 + * that could result in an inconsistency in case any of the inodes that 9223 + * are part of this rename operation were logged before. 9253 9224 */ 9254 - if (ret && (root_log_pinned || dest_log_pinned)) { 9255 - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || 9256 - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || 9257 - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || 9258 - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)) 9259 - btrfs_set_log_full_commit(trans); 9225 + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 9226 + btrfs_pin_log_trans(root); 9227 + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) 9228 + btrfs_pin_log_trans(dest); 9260 9229 9261 - if (root_log_pinned) { 9262 - btrfs_end_log_trans(root); 9263 - root_log_pinned = false; 9264 - } 9265 - if (dest_log_pinned) { 9266 - btrfs_end_log_trans(dest); 9267 - dest_log_pinned = false; 9268 - } 9269 - } 9230 + /* Do the log updates for all inodes. */ 9231 + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 9232 + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), 9233 + old_rename_ctx.index, new_dentry->d_parent); 9234 + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) 9235 + btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), 9236 + new_rename_ctx.index, old_dentry->d_parent); 9237 + 9238 + /* Now unpin the logs. 
*/ 9239 + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 9240 + btrfs_end_log_trans(root); 9241 + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) 9242 + btrfs_end_log_trans(dest); 9243 + out_fail: 9270 9244 ret2 = btrfs_end_transaction(trans); 9271 9245 ret = ret ? ret : ret2; 9272 9246 out_notrans: ··· 9326 9330 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9327 9331 struct inode *new_inode = d_inode(new_dentry); 9328 9332 struct inode *old_inode = d_inode(old_dentry); 9333 + struct btrfs_rename_ctx rename_ctx; 9329 9334 u64 index = 0; 9330 9335 int ret; 9331 9336 int ret2; 9332 9337 u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); 9333 - bool log_pinned = false; 9334 9338 9335 9339 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 9336 9340 return -EPERM; ··· 9435 9439 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 9436 9440 ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); 9437 9441 } else { 9438 - /* 9439 - * Now pin the log. We do it to ensure that no other task can 9440 - * sync the log while we are in progress with the rename, as 9441 - * that could result in an inconsistency in case any of the 9442 - * inodes that are part of this rename operation were logged 9443 - * before. 9444 - * 9445 - * We pin the log even if at this precise moment none of the 9446 - * inodes was logged before. This is because right after we 9447 - * checked for that, some other task fsyncing some other inode 9448 - * not involved with this rename operation could log that one of 9449 - * our inodes exists. 9450 - * 9451 - * We don't need to pin the logs before the above call to 9452 - * btrfs_insert_inode_ref(), since that does not need to change 9453 - * a log. 
9454 - */ 9455 - btrfs_pin_log_trans(root); 9456 - log_pinned = true; 9457 9442 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), 9458 9443 BTRFS_I(d_inode(old_dentry)), 9459 9444 old_dentry->d_name.name, 9460 - old_dentry->d_name.len); 9445 + old_dentry->d_name.len, 9446 + &rename_ctx); 9461 9447 if (!ret) 9462 9448 ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); 9463 9449 } ··· 9481 9503 if (old_inode->i_nlink == 1) 9482 9504 BTRFS_I(old_inode)->dir_index = index; 9483 9505 9484 - if (log_pinned) { 9485 - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), 9486 - new_dentry->d_parent); 9487 - btrfs_end_log_trans(root); 9488 - log_pinned = false; 9489 - } 9506 + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) 9507 + btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), 9508 + rename_ctx.index, new_dentry->d_parent); 9490 9509 9491 9510 if (flags & RENAME_WHITEOUT) { 9492 9511 ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, ··· 9495 9520 } 9496 9521 } 9497 9522 out_fail: 9498 - /* 9499 - * If we have pinned the log and an error happened, we unpin tasks 9500 - * trying to sync the log and force them to fallback to a transaction 9501 - * commit if the log currently contains any of the inodes involved in 9502 - * this rename operation (to ensure we do not persist a log with an 9503 - * inconsistent state for any of these inodes or leading to any 9504 - * inconsistencies when replayed). If the transaction was aborted, the 9505 - * abortion reason is propagated to userspace when attempting to commit 9506 - * the transaction. If the log does not contain any of these inodes, we 9507 - * allow the tasks to sync it. 
9508 - */ 9509 - if (ret && log_pinned) { 9510 - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || 9511 - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || 9512 - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || 9513 - (new_inode && 9514 - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) 9515 - btrfs_set_log_full_commit(trans); 9516 - 9517 - btrfs_end_log_trans(root); 9518 - log_pinned = false; 9519 - } 9520 9523 ret2 = btrfs_end_transaction(trans); 9521 9524 ret = ret ? ret : ret2; 9522 9525 out_notrans: ··· 9974 10021 9975 10022 em = alloc_extent_map(); 9976 10023 if (!em) { 9977 - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 9978 - &BTRFS_I(inode)->runtime_flags); 10024 + btrfs_set_inode_full_sync(BTRFS_I(inode)); 9979 10025 goto next; 9980 10026 } 9981 10027 ··· 10160 10208 put_page(page); 10161 10209 index++; 10162 10210 } 10211 + } 10212 + 10213 + static int btrfs_encoded_io_compression_from_extent( 10214 + struct btrfs_fs_info *fs_info, 10215 + int compress_type) 10216 + { 10217 + switch (compress_type) { 10218 + case BTRFS_COMPRESS_NONE: 10219 + return BTRFS_ENCODED_IO_COMPRESSION_NONE; 10220 + case BTRFS_COMPRESS_ZLIB: 10221 + return BTRFS_ENCODED_IO_COMPRESSION_ZLIB; 10222 + case BTRFS_COMPRESS_LZO: 10223 + /* 10224 + * The LZO format depends on the sector size. 64K is the maximum 10225 + * sector size that we support. 
10226 + */ 10227 + if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K) 10228 + return -EINVAL; 10229 + return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 10230 + (fs_info->sectorsize_bits - 12); 10231 + case BTRFS_COMPRESS_ZSTD: 10232 + return BTRFS_ENCODED_IO_COMPRESSION_ZSTD; 10233 + default: 10234 + return -EUCLEAN; 10235 + } 10236 + } 10237 + 10238 + static ssize_t btrfs_encoded_read_inline( 10239 + struct kiocb *iocb, 10240 + struct iov_iter *iter, u64 start, 10241 + u64 lockend, 10242 + struct extent_state **cached_state, 10243 + u64 extent_start, size_t count, 10244 + struct btrfs_ioctl_encoded_io_args *encoded, 10245 + bool *unlocked) 10246 + { 10247 + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 10248 + struct btrfs_root *root = inode->root; 10249 + struct btrfs_fs_info *fs_info = root->fs_info; 10250 + struct extent_io_tree *io_tree = &inode->io_tree; 10251 + struct btrfs_path *path; 10252 + struct extent_buffer *leaf; 10253 + struct btrfs_file_extent_item *item; 10254 + u64 ram_bytes; 10255 + unsigned long ptr; 10256 + void *tmp; 10257 + ssize_t ret; 10258 + 10259 + path = btrfs_alloc_path(); 10260 + if (!path) { 10261 + ret = -ENOMEM; 10262 + goto out; 10263 + } 10264 + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), 10265 + extent_start, 0); 10266 + if (ret) { 10267 + if (ret > 0) { 10268 + /* The extent item disappeared? 
*/ 10269 + ret = -EIO; 10270 + } 10271 + goto out; 10272 + } 10273 + leaf = path->nodes[0]; 10274 + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 10275 + 10276 + ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); 10277 + ptr = btrfs_file_extent_inline_start(item); 10278 + 10279 + encoded->len = min_t(u64, extent_start + ram_bytes, 10280 + inode->vfs_inode.i_size) - iocb->ki_pos; 10281 + ret = btrfs_encoded_io_compression_from_extent(fs_info, 10282 + btrfs_file_extent_compression(leaf, item)); 10283 + if (ret < 0) 10284 + goto out; 10285 + encoded->compression = ret; 10286 + if (encoded->compression) { 10287 + size_t inline_size; 10288 + 10289 + inline_size = btrfs_file_extent_inline_item_len(leaf, 10290 + path->slots[0]); 10291 + if (inline_size > count) { 10292 + ret = -ENOBUFS; 10293 + goto out; 10294 + } 10295 + count = inline_size; 10296 + encoded->unencoded_len = ram_bytes; 10297 + encoded->unencoded_offset = iocb->ki_pos - extent_start; 10298 + } else { 10299 + count = min_t(u64, count, encoded->len); 10300 + encoded->len = count; 10301 + encoded->unencoded_len = count; 10302 + ptr += iocb->ki_pos - extent_start; 10303 + } 10304 + 10305 + tmp = kmalloc(count, GFP_NOFS); 10306 + if (!tmp) { 10307 + ret = -ENOMEM; 10308 + goto out; 10309 + } 10310 + read_extent_buffer(leaf, tmp, ptr, count); 10311 + btrfs_release_path(path); 10312 + unlock_extent_cached(io_tree, start, lockend, cached_state); 10313 + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10314 + *unlocked = true; 10315 + 10316 + ret = copy_to_iter(tmp, count, iter); 10317 + if (ret != count) 10318 + ret = -EFAULT; 10319 + kfree(tmp); 10320 + out: 10321 + btrfs_free_path(path); 10322 + return ret; 10323 + } 10324 + 10325 + struct btrfs_encoded_read_private { 10326 + struct btrfs_inode *inode; 10327 + u64 file_offset; 10328 + wait_queue_head_t wait; 10329 + atomic_t pending; 10330 + blk_status_t status; 10331 + bool skip_csum; 10332 + }; 10333 + 10334 + static 
blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, 10335 + struct bio *bio, int mirror_num) 10336 + { 10337 + struct btrfs_encoded_read_private *priv = bio->bi_private; 10338 + struct btrfs_bio *bbio = btrfs_bio(bio); 10339 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 10340 + blk_status_t ret; 10341 + 10342 + if (!priv->skip_csum) { 10343 + ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); 10344 + if (ret) 10345 + return ret; 10346 + } 10347 + 10348 + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); 10349 + if (ret) { 10350 + btrfs_bio_free_csum(bbio); 10351 + return ret; 10352 + } 10353 + 10354 + atomic_inc(&priv->pending); 10355 + ret = btrfs_map_bio(fs_info, bio, mirror_num); 10356 + if (ret) { 10357 + atomic_dec(&priv->pending); 10358 + btrfs_bio_free_csum(bbio); 10359 + } 10360 + return ret; 10361 + } 10362 + 10363 + static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) 10364 + { 10365 + const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK); 10366 + struct btrfs_encoded_read_private *priv = bbio->bio.bi_private; 10367 + struct btrfs_inode *inode = priv->inode; 10368 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 10369 + u32 sectorsize = fs_info->sectorsize; 10370 + struct bio_vec *bvec; 10371 + struct bvec_iter_all iter_all; 10372 + u64 start = priv->file_offset; 10373 + u32 bio_offset = 0; 10374 + 10375 + if (priv->skip_csum || !uptodate) 10376 + return bbio->bio.bi_status; 10377 + 10378 + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { 10379 + unsigned int i, nr_sectors, pgoff; 10380 + 10381 + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); 10382 + pgoff = bvec->bv_offset; 10383 + for (i = 0; i < nr_sectors; i++) { 10384 + ASSERT(pgoff < PAGE_SIZE); 10385 + if (check_data_csum(&inode->vfs_inode, bbio, bio_offset, 10386 + bvec->bv_page, pgoff, start)) 10387 + return BLK_STS_IOERR; 10388 + start += sectorsize; 10389 + bio_offset += sectorsize; 10390 + pgoff += 
sectorsize; 10391 + } 10392 + } 10393 + return BLK_STS_OK; 10394 + } 10395 + 10396 + static void btrfs_encoded_read_endio(struct bio *bio) 10397 + { 10398 + struct btrfs_encoded_read_private *priv = bio->bi_private; 10399 + struct btrfs_bio *bbio = btrfs_bio(bio); 10400 + blk_status_t status; 10401 + 10402 + status = btrfs_encoded_read_verify_csum(bbio); 10403 + if (status) { 10404 + /* 10405 + * The memory barrier implied by the atomic_dec_return() here 10406 + * pairs with the memory barrier implied by the 10407 + * atomic_dec_return() or io_wait_event() in 10408 + * btrfs_encoded_read_regular_fill_pages() to ensure that this 10409 + * write is observed before the load of status in 10410 + * btrfs_encoded_read_regular_fill_pages(). 10411 + */ 10412 + WRITE_ONCE(priv->status, status); 10413 + } 10414 + if (!atomic_dec_return(&priv->pending)) 10415 + wake_up(&priv->wait); 10416 + btrfs_bio_free_csum(bbio); 10417 + bio_put(bio); 10418 + } 10419 + 10420 + static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, 10421 + u64 file_offset, 10422 + u64 disk_bytenr, 10423 + u64 disk_io_size, 10424 + struct page **pages) 10425 + { 10426 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 10427 + struct btrfs_encoded_read_private priv = { 10428 + .inode = inode, 10429 + .file_offset = file_offset, 10430 + .pending = ATOMIC_INIT(1), 10431 + .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM), 10432 + }; 10433 + unsigned long i = 0; 10434 + u64 cur = 0; 10435 + int ret; 10436 + 10437 + init_waitqueue_head(&priv.wait); 10438 + /* 10439 + * Submit bios for the extent, splitting due to bio or stripe limits as 10440 + * necessary. 
10441 + */ 10442 + while (cur < disk_io_size) { 10443 + struct extent_map *em; 10444 + struct btrfs_io_geometry geom; 10445 + struct bio *bio = NULL; 10446 + u64 remaining; 10447 + 10448 + em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur, 10449 + disk_io_size - cur); 10450 + if (IS_ERR(em)) { 10451 + ret = PTR_ERR(em); 10452 + } else { 10453 + ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ, 10454 + disk_bytenr + cur, &geom); 10455 + free_extent_map(em); 10456 + } 10457 + if (ret) { 10458 + WRITE_ONCE(priv.status, errno_to_blk_status(ret)); 10459 + break; 10460 + } 10461 + remaining = min(geom.len, disk_io_size - cur); 10462 + while (bio || remaining) { 10463 + size_t bytes = min_t(u64, remaining, PAGE_SIZE); 10464 + 10465 + if (!bio) { 10466 + bio = btrfs_bio_alloc(BIO_MAX_VECS); 10467 + bio->bi_iter.bi_sector = 10468 + (disk_bytenr + cur) >> SECTOR_SHIFT; 10469 + bio->bi_end_io = btrfs_encoded_read_endio; 10470 + bio->bi_private = &priv; 10471 + bio->bi_opf = REQ_OP_READ; 10472 + } 10473 + 10474 + if (!bytes || 10475 + bio_add_page(bio, pages[i], bytes, 0) < bytes) { 10476 + blk_status_t status; 10477 + 10478 + status = submit_encoded_read_bio(inode, bio, 0); 10479 + if (status) { 10480 + WRITE_ONCE(priv.status, status); 10481 + bio_put(bio); 10482 + goto out; 10483 + } 10484 + bio = NULL; 10485 + continue; 10486 + } 10487 + 10488 + i++; 10489 + cur += bytes; 10490 + remaining -= bytes; 10491 + } 10492 + } 10493 + 10494 + out: 10495 + if (atomic_dec_return(&priv.pending)) 10496 + io_wait_event(priv.wait, !atomic_read(&priv.pending)); 10497 + /* See btrfs_encoded_read_endio() for ordering. 
*/ 10498 + return blk_status_to_errno(READ_ONCE(priv.status)); 10499 + } 10500 + 10501 + static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, 10502 + struct iov_iter *iter, 10503 + u64 start, u64 lockend, 10504 + struct extent_state **cached_state, 10505 + u64 disk_bytenr, u64 disk_io_size, 10506 + size_t count, bool compressed, 10507 + bool *unlocked) 10508 + { 10509 + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 10510 + struct extent_io_tree *io_tree = &inode->io_tree; 10511 + struct page **pages; 10512 + unsigned long nr_pages, i; 10513 + u64 cur; 10514 + size_t page_offset; 10515 + ssize_t ret; 10516 + 10517 + nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); 10518 + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); 10519 + if (!pages) 10520 + return -ENOMEM; 10521 + for (i = 0; i < nr_pages; i++) { 10522 + pages[i] = alloc_page(GFP_NOFS); 10523 + if (!pages[i]) { 10524 + ret = -ENOMEM; 10525 + goto out; 10526 + } 10527 + } 10528 + 10529 + ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, 10530 + disk_io_size, pages); 10531 + if (ret) 10532 + goto out; 10533 + 10534 + unlock_extent_cached(io_tree, start, lockend, cached_state); 10535 + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10536 + *unlocked = true; 10537 + 10538 + if (compressed) { 10539 + i = 0; 10540 + page_offset = 0; 10541 + } else { 10542 + i = (iocb->ki_pos - start) >> PAGE_SHIFT; 10543 + page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1); 10544 + } 10545 + cur = 0; 10546 + while (cur < count) { 10547 + size_t bytes = min_t(size_t, count - cur, 10548 + PAGE_SIZE - page_offset); 10549 + 10550 + if (copy_page_to_iter(pages[i], page_offset, bytes, 10551 + iter) != bytes) { 10552 + ret = -EFAULT; 10553 + goto out; 10554 + } 10555 + i++; 10556 + cur += bytes; 10557 + page_offset = 0; 10558 + } 10559 + ret = count; 10560 + out: 10561 + for (i = 0; i < nr_pages; i++) { 10562 + if (pages[i]) 10563 + __free_page(pages[i]); 10564 + 
} 10565 + kfree(pages); 10566 + return ret; 10567 + } 10568 + 10569 + ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, 10570 + struct btrfs_ioctl_encoded_io_args *encoded) 10571 + { 10572 + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 10573 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 10574 + struct extent_io_tree *io_tree = &inode->io_tree; 10575 + ssize_t ret; 10576 + size_t count = iov_iter_count(iter); 10577 + u64 start, lockend, disk_bytenr, disk_io_size; 10578 + struct extent_state *cached_state = NULL; 10579 + struct extent_map *em; 10580 + bool unlocked = false; 10581 + 10582 + file_accessed(iocb->ki_filp); 10583 + 10584 + btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10585 + 10586 + if (iocb->ki_pos >= inode->vfs_inode.i_size) { 10587 + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10588 + return 0; 10589 + } 10590 + start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); 10591 + /* 10592 + * We don't know how long the extent containing iocb->ki_pos is, but if 10593 + * it's compressed we know that it won't be longer than this. 
10594 + */ 10595 + lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; 10596 + 10597 + for (;;) { 10598 + struct btrfs_ordered_extent *ordered; 10599 + 10600 + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, 10601 + lockend - start + 1); 10602 + if (ret) 10603 + goto out_unlock_inode; 10604 + lock_extent_bits(io_tree, start, lockend, &cached_state); 10605 + ordered = btrfs_lookup_ordered_range(inode, start, 10606 + lockend - start + 1); 10607 + if (!ordered) 10608 + break; 10609 + btrfs_put_ordered_extent(ordered); 10610 + unlock_extent_cached(io_tree, start, lockend, &cached_state); 10611 + cond_resched(); 10612 + } 10613 + 10614 + em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); 10615 + if (IS_ERR(em)) { 10616 + ret = PTR_ERR(em); 10617 + goto out_unlock_extent; 10618 + } 10619 + 10620 + if (em->block_start == EXTENT_MAP_INLINE) { 10621 + u64 extent_start = em->start; 10622 + 10623 + /* 10624 + * For inline extents we get everything we need out of the 10625 + * extent item. 10626 + */ 10627 + free_extent_map(em); 10628 + em = NULL; 10629 + ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, 10630 + &cached_state, extent_start, 10631 + count, encoded, &unlocked); 10632 + goto out; 10633 + } 10634 + 10635 + /* 10636 + * We only want to return up to EOF even if the extent extends beyond 10637 + * that. 10638 + */ 10639 + encoded->len = min_t(u64, extent_map_end(em), 10640 + inode->vfs_inode.i_size) - iocb->ki_pos; 10641 + if (em->block_start == EXTENT_MAP_HOLE || 10642 + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 10643 + disk_bytenr = EXTENT_MAP_HOLE; 10644 + count = min_t(u64, count, encoded->len); 10645 + encoded->len = count; 10646 + encoded->unencoded_len = count; 10647 + } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 10648 + disk_bytenr = em->block_start; 10649 + /* 10650 + * Bail if the buffer isn't large enough to return the whole 10651 + * compressed extent. 
10652 + */ 10653 + if (em->block_len > count) { 10654 + ret = -ENOBUFS; 10655 + goto out_em; 10656 + } 10657 + disk_io_size = count = em->block_len; 10658 + encoded->unencoded_len = em->ram_bytes; 10659 + encoded->unencoded_offset = iocb->ki_pos - em->orig_start; 10660 + ret = btrfs_encoded_io_compression_from_extent(fs_info, 10661 + em->compress_type); 10662 + if (ret < 0) 10663 + goto out_em; 10664 + encoded->compression = ret; 10665 + } else { 10666 + disk_bytenr = em->block_start + (start - em->start); 10667 + if (encoded->len > count) 10668 + encoded->len = count; 10669 + /* 10670 + * Don't read beyond what we locked. This also limits the page 10671 + * allocations that we'll do. 10672 + */ 10673 + disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; 10674 + count = start + disk_io_size - iocb->ki_pos; 10675 + encoded->len = count; 10676 + encoded->unencoded_len = count; 10677 + disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); 10678 + } 10679 + free_extent_map(em); 10680 + em = NULL; 10681 + 10682 + if (disk_bytenr == EXTENT_MAP_HOLE) { 10683 + unlock_extent_cached(io_tree, start, lockend, &cached_state); 10684 + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10685 + unlocked = true; 10686 + ret = iov_iter_zero(count, iter); 10687 + if (ret != count) 10688 + ret = -EFAULT; 10689 + } else { 10690 + ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, 10691 + &cached_state, disk_bytenr, 10692 + disk_io_size, count, 10693 + encoded->compression, 10694 + &unlocked); 10695 + } 10696 + 10697 + out: 10698 + if (ret >= 0) 10699 + iocb->ki_pos += encoded->len; 10700 + out_em: 10701 + free_extent_map(em); 10702 + out_unlock_extent: 10703 + if (!unlocked) 10704 + unlock_extent_cached(io_tree, start, lockend, &cached_state); 10705 + out_unlock_inode: 10706 + if (!unlocked) 10707 + btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); 10708 + return ret; 10709 + } 10710 + 10711 + ssize_t btrfs_do_encoded_write(struct kiocb 
*iocb, struct iov_iter *from, 10712 + const struct btrfs_ioctl_encoded_io_args *encoded) 10713 + { 10714 + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); 10715 + struct btrfs_root *root = inode->root; 10716 + struct btrfs_fs_info *fs_info = root->fs_info; 10717 + struct extent_io_tree *io_tree = &inode->io_tree; 10718 + struct extent_changeset *data_reserved = NULL; 10719 + struct extent_state *cached_state = NULL; 10720 + int compression; 10721 + size_t orig_count; 10722 + u64 start, end; 10723 + u64 num_bytes, ram_bytes, disk_num_bytes; 10724 + unsigned long nr_pages, i; 10725 + struct page **pages; 10726 + struct btrfs_key ins; 10727 + bool extent_reserved = false; 10728 + struct extent_map *em; 10729 + ssize_t ret; 10730 + 10731 + switch (encoded->compression) { 10732 + case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: 10733 + compression = BTRFS_COMPRESS_ZLIB; 10734 + break; 10735 + case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: 10736 + compression = BTRFS_COMPRESS_ZSTD; 10737 + break; 10738 + case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: 10739 + case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: 10740 + case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: 10741 + case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: 10742 + case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: 10743 + /* The sector size must match for LZO. */ 10744 + if (encoded->compression - 10745 + BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != 10746 + fs_info->sectorsize_bits) 10747 + return -EINVAL; 10748 + compression = BTRFS_COMPRESS_LZO; 10749 + break; 10750 + default: 10751 + return -EINVAL; 10752 + } 10753 + if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) 10754 + return -EINVAL; 10755 + 10756 + orig_count = iov_iter_count(from); 10757 + 10758 + /* The extent size must be sane. */ 10759 + if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || 10760 + orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) 10761 + return -EINVAL; 10762 + 10763 + /* 10764 + * The compressed data must be smaller than the decompressed data. 
10765 + * 10766 + * It's of course possible for data to compress to larger or the same 10767 + * size, but the buffered I/O path falls back to no compression for such 10768 + * data, and we don't want to break any assumptions by creating these 10769 + * extents. 10770 + * 10771 + * Note that this is less strict than the current check we have that the 10772 + * compressed data must be at least one sector smaller than the 10773 + * decompressed data. We only want to enforce the weaker requirement 10774 + * from old kernels that it is at least one byte smaller. 10775 + */ 10776 + if (orig_count >= encoded->unencoded_len) 10777 + return -EINVAL; 10778 + 10779 + /* The extent must start on a sector boundary. */ 10780 + start = iocb->ki_pos; 10781 + if (!IS_ALIGNED(start, fs_info->sectorsize)) 10782 + return -EINVAL; 10783 + 10784 + /* 10785 + * The extent must end on a sector boundary. However, we allow a write 10786 + * which ends at or extends i_size to have an unaligned length; we round 10787 + * up the extent size and set i_size to the unaligned end. 10788 + */ 10789 + if (start + encoded->len < inode->vfs_inode.i_size && 10790 + !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) 10791 + return -EINVAL; 10792 + 10793 + /* Finally, the offset in the unencoded data must be sector-aligned. */ 10794 + if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) 10795 + return -EINVAL; 10796 + 10797 + num_bytes = ALIGN(encoded->len, fs_info->sectorsize); 10798 + ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); 10799 + end = start + num_bytes - 1; 10800 + 10801 + /* 10802 + * If the extent cannot be inline, the compressed data on disk must be 10803 + * sector-aligned. For convenience, we extend it with zeroes if it 10804 + * isn't. 
10805 + */ 10806 + disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); 10807 + nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); 10808 + pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); 10809 + if (!pages) 10810 + return -ENOMEM; 10811 + for (i = 0; i < nr_pages; i++) { 10812 + size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); 10813 + char *kaddr; 10814 + 10815 + pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); 10816 + if (!pages[i]) { 10817 + ret = -ENOMEM; 10818 + goto out_pages; 10819 + } 10820 + kaddr = kmap(pages[i]); 10821 + if (copy_from_iter(kaddr, bytes, from) != bytes) { 10822 + kunmap(pages[i]); 10823 + ret = -EFAULT; 10824 + goto out_pages; 10825 + } 10826 + if (bytes < PAGE_SIZE) 10827 + memset(kaddr + bytes, 0, PAGE_SIZE - bytes); 10828 + kunmap(pages[i]); 10829 + } 10830 + 10831 + for (;;) { 10832 + struct btrfs_ordered_extent *ordered; 10833 + 10834 + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); 10835 + if (ret) 10836 + goto out_pages; 10837 + ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, 10838 + start >> PAGE_SHIFT, 10839 + end >> PAGE_SHIFT); 10840 + if (ret) 10841 + goto out_pages; 10842 + lock_extent_bits(io_tree, start, end, &cached_state); 10843 + ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); 10844 + if (!ordered && 10845 + !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) 10846 + break; 10847 + if (ordered) 10848 + btrfs_put_ordered_extent(ordered); 10849 + unlock_extent_cached(io_tree, start, end, &cached_state); 10850 + cond_resched(); 10851 + } 10852 + 10853 + /* 10854 + * We don't use the higher-level delalloc space functions because our 10855 + * num_bytes and disk_num_bytes are different. 
10856 + */ 10857 + ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); 10858 + if (ret) 10859 + goto out_unlock; 10860 + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); 10861 + if (ret) 10862 + goto out_free_data_space; 10863 + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes); 10864 + if (ret) 10865 + goto out_qgroup_free_data; 10866 + 10867 + /* Try an inline extent first. */ 10868 + if (start == 0 && encoded->unencoded_len == encoded->len && 10869 + encoded->unencoded_offset == 0) { 10870 + ret = cow_file_range_inline(inode, encoded->len, orig_count, 10871 + compression, pages, true); 10872 + if (ret <= 0) { 10873 + if (ret == 0) 10874 + ret = orig_count; 10875 + goto out_delalloc_release; 10876 + } 10877 + } 10878 + 10879 + ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, 10880 + disk_num_bytes, 0, 0, &ins, 1, 1); 10881 + if (ret) 10882 + goto out_delalloc_release; 10883 + extent_reserved = true; 10884 + 10885 + em = create_io_em(inode, start, num_bytes, 10886 + start - encoded->unencoded_offset, ins.objectid, 10887 + ins.offset, ins.offset, ram_bytes, compression, 10888 + BTRFS_ORDERED_COMPRESSED); 10889 + if (IS_ERR(em)) { 10890 + ret = PTR_ERR(em); 10891 + goto out_free_reserved; 10892 + } 10893 + free_extent_map(em); 10894 + 10895 + ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes, 10896 + ins.objectid, ins.offset, 10897 + encoded->unencoded_offset, 10898 + (1 << BTRFS_ORDERED_ENCODED) | 10899 + (1 << BTRFS_ORDERED_COMPRESSED), 10900 + compression); 10901 + if (ret) { 10902 + btrfs_drop_extent_cache(inode, start, end, 0); 10903 + goto out_free_reserved; 10904 + } 10905 + btrfs_dec_block_group_reservations(fs_info, ins.objectid); 10906 + 10907 + if (start + encoded->len > inode->vfs_inode.i_size) 10908 + i_size_write(&inode->vfs_inode, start + encoded->len); 10909 + 10910 + unlock_extent_cached(io_tree, start, end, &cached_state); 10911 + 10912 + 
btrfs_delalloc_release_extents(inode, num_bytes); 10913 + 10914 + if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, 10915 + ins.offset, pages, nr_pages, 0, NULL, 10916 + false)) { 10917 + btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0); 10918 + ret = -EIO; 10919 + goto out_pages; 10920 + } 10921 + ret = orig_count; 10922 + goto out; 10923 + 10924 + out_free_reserved: 10925 + btrfs_dec_block_group_reservations(fs_info, ins.objectid); 10926 + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); 10927 + out_delalloc_release: 10928 + btrfs_delalloc_release_extents(inode, num_bytes); 10929 + btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); 10930 + out_qgroup_free_data: 10931 + if (ret < 0) 10932 + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); 10933 + out_free_data_space: 10934 + /* 10935 + * If btrfs_reserve_extent() succeeded, then we already decremented 10936 + * bytes_may_use. 10937 + */ 10938 + if (!extent_reserved) 10939 + btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); 10940 + out_unlock: 10941 + unlock_extent_cached(io_tree, start, end, &cached_state); 10942 + out_pages: 10943 + for (i = 0; i < nr_pages; i++) { 10944 + if (pages[i]) 10945 + __free_page(pages[i]); 10946 + } 10947 + kvfree(pages); 10948 + out: 10949 + if (ret >= 0) 10950 + iocb->ki_pos += encoded->len; 10951 + return ret; 10163 10952 } 10164 10953 10165 10954 #ifdef CONFIG_SWAP

+266 -43

fs/btrfs/ioctl.c

··· 28 28 #include <linux/iversion.h> 29 29 #include <linux/fileattr.h> 30 30 #include <linux/fsverity.h> 31 + #include <linux/sched/xacct.h> 31 32 #include "ctree.h" 32 33 #include "disk-io.h" 33 34 #include "export.h" ··· 89 88 90 89 #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ 91 90 struct btrfs_ioctl_send_args_32) 91 + 92 + struct btrfs_ioctl_encoded_io_args_32 { 93 + compat_uptr_t iov; 94 + compat_ulong_t iovcnt; 95 + __s64 offset; 96 + __u64 flags; 97 + __u64 len; 98 + __u64 unencoded_len; 99 + __u64 unencoded_offset; 100 + __u32 compression; 101 + __u32 encryption; 102 + __u8 reserved[64]; 103 + }; 104 + 105 + #define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \ 106 + struct btrfs_ioctl_encoded_io_args_32) 107 + #define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \ 108 + struct btrfs_ioctl_encoded_io_args_32) 92 109 #endif 93 110 94 111 /* Mask out flags that are inappropriate for the given type of inode. */ ··· 459 440 } 460 441 } 461 442 462 - static int btrfs_ioctl_getversion(struct file *file, int __user *arg) 443 + static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) 463 444 { 464 - struct inode *inode = file_inode(file); 465 - 466 445 return put_user(inode->i_generation, arg); 467 446 } 468 447 ··· 769 752 struct btrfs_pending_snapshot *pending_snapshot; 770 753 struct btrfs_trans_handle *trans; 771 754 int ret; 755 + 756 + /* We do not support snapshotting right now. */ 757 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 758 + btrfs_warn(fs_info, 759 + "extent tree v2 doesn't support snapshotting yet"); 760 + return -EOPNOTSUPP; 761 + } 772 762 773 763 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 774 764 return -EINVAL; ··· 1546 1522 } 1547 1523 1548 1524 #define CLUSTER_SIZE (SZ_256K) 1525 + static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); 1549 1526 1550 1527 /* 1551 1528 * Defrag one contiguous target range. 
··· 1692 1667 LIST_HEAD(target_list); 1693 1668 int ret; 1694 1669 1695 - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); 1696 1670 ret = defrag_collect_targets(inode, start, len, extent_thresh, 1697 1671 newer_than, do_compress, false, 1698 1672 &target_list, NULL); ··· 1833 1809 const unsigned long prev_sectors_defragged = sectors_defragged; 1834 1810 u64 last_scanned = cur; 1835 1811 u64 cluster_end; 1836 - 1837 - /* The cluster size 256K should always be page aligned */ 1838 - BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); 1839 1812 1840 1813 if (btrfs_defrag_cancelled(fs_info)) { 1841 1814 ret = -EAGAIN; ··· 2250 2229 return ret; 2251 2230 } 2252 2231 2253 - static noinline int btrfs_ioctl_subvol_getflags(struct file *file, 2232 + static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, 2254 2233 void __user *arg) 2255 2234 { 2256 - struct inode *inode = file_inode(file); 2257 2235 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2258 2236 struct btrfs_root *root = BTRFS_I(inode)->root; 2259 2237 int ret = 0; ··· 2582 2562 return ret; 2583 2563 } 2584 2564 2585 - static noinline int btrfs_ioctl_tree_search(struct file *file, 2586 - void __user *argp) 2565 + static noinline int btrfs_ioctl_tree_search(struct inode *inode, 2566 + void __user *argp) 2587 2567 { 2588 2568 struct btrfs_ioctl_search_args __user *uargs; 2589 2569 struct btrfs_ioctl_search_key sk; 2590 - struct inode *inode; 2591 2570 int ret; 2592 2571 size_t buf_size; 2593 2572 ··· 2600 2581 2601 2582 buf_size = sizeof(uargs->buf); 2602 2583 2603 - inode = file_inode(file); 2604 2584 ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); 2605 2585 2606 2586 /* ··· 2614 2596 return ret; 2615 2597 } 2616 2598 2617 - static noinline int btrfs_ioctl_tree_search_v2(struct file *file, 2599 + static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, 2618 2600 void __user *argp) 2619 2601 { 2620 2602 struct btrfs_ioctl_search_args_v2 __user *uarg; 2621 2603 struct 
btrfs_ioctl_search_args_v2 args; 2622 - struct inode *inode; 2623 2604 int ret; 2624 2605 size_t buf_size; 2625 2606 const size_t buf_limit = SZ_16M; ··· 2637 2620 if (buf_size > buf_limit) 2638 2621 buf_size = buf_limit; 2639 2622 2640 - inode = file_inode(file); 2641 2623 ret = search_ioctl(inode, &args.key, &buf_size, 2642 2624 (char __user *)(&uarg->buf[0])); 2643 2625 if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ··· 2887 2871 return ret; 2888 2872 } 2889 2873 2890 - static noinline int btrfs_ioctl_ino_lookup(struct file *file, 2874 + static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, 2891 2875 void __user *argp) 2892 2876 { 2893 2877 struct btrfs_ioctl_ino_lookup_args *args; 2894 - struct inode *inode; 2895 2878 int ret = 0; 2896 2879 2897 2880 args = memdup_user(argp, sizeof(*args)); 2898 2881 if (IS_ERR(args)) 2899 2882 return PTR_ERR(args); 2900 2883 2901 - inode = file_inode(file); 2902 - 2903 2884 /* 2904 2885 * Unprivileged query to obtain the containing subvolume root id. The 2905 2886 * path is reset so it's consistent with btrfs_search_path_in_tree. 
2906 2887 */ 2907 2888 if (args->treeid == 0) 2908 - args->treeid = BTRFS_I(inode)->root->root_key.objectid; 2889 + args->treeid = root->root_key.objectid; 2909 2890 2910 2891 if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { 2911 2892 args->name[0] = 0; ··· 2914 2901 goto out; 2915 2902 } 2916 2903 2917 - ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, 2904 + ret = btrfs_search_path_in_tree(root->fs_info, 2918 2905 args->treeid, args->objectid, 2919 2906 args->name); 2920 2907 ··· 2970 2957 } 2971 2958 2972 2959 /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ 2973 - static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) 2960 + static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) 2974 2961 { 2975 2962 struct btrfs_ioctl_get_subvol_info_args *subvol_info; 2976 2963 struct btrfs_fs_info *fs_info; ··· 2982 2969 struct extent_buffer *leaf; 2983 2970 unsigned long item_off; 2984 2971 unsigned long item_len; 2985 - struct inode *inode; 2986 2972 int slot; 2987 2973 int ret = 0; 2988 2974 ··· 2995 2983 return -ENOMEM; 2996 2984 } 2997 2985 2998 - inode = file_inode(file); 2999 2986 fs_info = BTRFS_I(inode)->root->fs_info; 3000 2987 3001 2988 /* Get root_item of inode's subvolume */ ··· 3088 3077 * Return ROOT_REF information of the subvolume containing this inode 3089 3078 * except the subvolume name. 
3090 3079 */ 3091 - static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) 3080 + static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, 3081 + void __user *argp) 3092 3082 { 3093 3083 struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; 3094 3084 struct btrfs_root_ref *rref; 3095 - struct btrfs_root *root; 3096 3085 struct btrfs_path *path; 3097 3086 struct btrfs_key key; 3098 3087 struct extent_buffer *leaf; 3099 - struct inode *inode; 3100 3088 u64 objectid; 3101 3089 int slot; 3102 3090 int ret; ··· 3111 3101 return PTR_ERR(rootrefs); 3112 3102 } 3113 3103 3114 - inode = file_inode(file); 3115 - root = BTRFS_I(inode)->root->fs_info->tree_root; 3116 - objectid = BTRFS_I(inode)->root->root_key.objectid; 3117 - 3104 + objectid = root->root_key.objectid; 3118 3105 key.objectid = objectid; 3119 3106 key.type = BTRFS_ROOT_REF_KEY; 3120 3107 key.offset = rootrefs->min_treeid; 3121 3108 found = 0; 3122 3109 3110 + root = root->fs_info->tree_root; 3123 3111 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3124 3112 if (ret < 0) { 3125 3113 goto out; ··· 3196 3188 int subvol_namelen; 3197 3189 int err = 0; 3198 3190 bool destroy_parent = false; 3191 + 3192 + /* We don't support snapshots with extent tree v2 yet. 
*/ 3193 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 3194 + btrfs_err(fs_info, 3195 + "extent tree v2 doesn't support snapshot deletion yet"); 3196 + return -EOPNOTSUPP; 3197 + } 3199 3198 3200 3199 if (destroy_v2) { 3201 3200 vol_args2 = memdup_user(arg, sizeof(*vol_args2)); ··· 3478 3463 3479 3464 if (!capable(CAP_SYS_ADMIN)) 3480 3465 return -EPERM; 3466 + 3467 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 3468 + btrfs_err(fs_info, "device add not supported on extent tree v2 yet"); 3469 + return -EINVAL; 3470 + } 3481 3471 3482 3472 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { 3483 3473 if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) ··· 4009 3989 if (!capable(CAP_SYS_ADMIN)) 4010 3990 return -EPERM; 4011 3991 3992 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 3993 + btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); 3994 + return -EINVAL; 3995 + } 3996 + 4012 3997 sa = memdup_user(arg, sizeof(*sa)); 4013 3998 if (IS_ERR(sa)) 4014 3999 return PTR_ERR(sa); ··· 4112 4087 4113 4088 if (!capable(CAP_SYS_ADMIN)) 4114 4089 return -EPERM; 4090 + 4091 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 4092 + btrfs_err(fs_info, "device replace not supported on extent tree v2 yet"); 4093 + return -EINVAL; 4094 + } 4115 4095 4116 4096 p = memdup_user(arg, sizeof(*p)); 4117 4097 if (IS_ERR(p)) ··· 5179 5149 return ret; 5180 5150 } 5181 5151 5182 - static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) 5152 + static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat) 5183 5153 { 5184 5154 struct btrfs_ioctl_send_args *arg; 5185 5155 int ret; ··· 5209 5179 if (IS_ERR(arg)) 5210 5180 return PTR_ERR(arg); 5211 5181 } 5212 - ret = btrfs_ioctl_send(file, arg); 5182 + ret = btrfs_ioctl_send(inode, arg); 5213 5183 kfree(arg); 5184 + return ret; 5185 + } 5186 + 5187 + static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, 5188 + bool compat) 5189 + { 
5190 + struct btrfs_ioctl_encoded_io_args args = { 0 }; 5191 + size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, 5192 + flags); 5193 + size_t copy_end; 5194 + struct iovec iovstack[UIO_FASTIOV]; 5195 + struct iovec *iov = iovstack; 5196 + struct iov_iter iter; 5197 + loff_t pos; 5198 + struct kiocb kiocb; 5199 + ssize_t ret; 5200 + 5201 + if (!capable(CAP_SYS_ADMIN)) { 5202 + ret = -EPERM; 5203 + goto out_acct; 5204 + } 5205 + 5206 + if (compat) { 5207 + #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 5208 + struct btrfs_ioctl_encoded_io_args_32 args32; 5209 + 5210 + copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, 5211 + flags); 5212 + if (copy_from_user(&args32, argp, copy_end)) { 5213 + ret = -EFAULT; 5214 + goto out_acct; 5215 + } 5216 + args.iov = compat_ptr(args32.iov); 5217 + args.iovcnt = args32.iovcnt; 5218 + args.offset = args32.offset; 5219 + args.flags = args32.flags; 5220 + #else 5221 + return -ENOTTY; 5222 + #endif 5223 + } else { 5224 + copy_end = copy_end_kernel; 5225 + if (copy_from_user(&args, argp, copy_end)) { 5226 + ret = -EFAULT; 5227 + goto out_acct; 5228 + } 5229 + } 5230 + if (args.flags != 0) { 5231 + ret = -EINVAL; 5232 + goto out_acct; 5233 + } 5234 + 5235 + ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), 5236 + &iov, &iter); 5237 + if (ret < 0) 5238 + goto out_acct; 5239 + 5240 + if (iov_iter_count(&iter) == 0) { 5241 + ret = 0; 5242 + goto out_iov; 5243 + } 5244 + pos = args.offset; 5245 + ret = rw_verify_area(READ, file, &pos, args.len); 5246 + if (ret < 0) 5247 + goto out_iov; 5248 + 5249 + init_sync_kiocb(&kiocb, file); 5250 + kiocb.ki_pos = pos; 5251 + 5252 + ret = btrfs_encoded_read(&kiocb, &iter, &args); 5253 + if (ret >= 0) { 5254 + fsnotify_access(file); 5255 + if (copy_to_user(argp + copy_end, 5256 + (char *)&args + copy_end_kernel, 5257 + sizeof(args) - copy_end_kernel)) 5258 + ret = -EFAULT; 5259 + } 5260 + 5261 + out_iov: 5262 + kfree(iov); 5263 + out_acct: 5264 
+ if (ret > 0) 5265 + add_rchar(current, ret); 5266 + inc_syscr(current); 5267 + return ret; 5268 + } 5269 + 5270 + static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) 5271 + { 5272 + struct btrfs_ioctl_encoded_io_args args; 5273 + struct iovec iovstack[UIO_FASTIOV]; 5274 + struct iovec *iov = iovstack; 5275 + struct iov_iter iter; 5276 + loff_t pos; 5277 + struct kiocb kiocb; 5278 + ssize_t ret; 5279 + 5280 + if (!capable(CAP_SYS_ADMIN)) { 5281 + ret = -EPERM; 5282 + goto out_acct; 5283 + } 5284 + 5285 + if (!(file->f_mode & FMODE_WRITE)) { 5286 + ret = -EBADF; 5287 + goto out_acct; 5288 + } 5289 + 5290 + if (compat) { 5291 + #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 5292 + struct btrfs_ioctl_encoded_io_args_32 args32; 5293 + 5294 + if (copy_from_user(&args32, argp, sizeof(args32))) { 5295 + ret = -EFAULT; 5296 + goto out_acct; 5297 + } 5298 + args.iov = compat_ptr(args32.iov); 5299 + args.iovcnt = args32.iovcnt; 5300 + args.offset = args32.offset; 5301 + args.flags = args32.flags; 5302 + args.len = args32.len; 5303 + args.unencoded_len = args32.unencoded_len; 5304 + args.unencoded_offset = args32.unencoded_offset; 5305 + args.compression = args32.compression; 5306 + args.encryption = args32.encryption; 5307 + memcpy(args.reserved, args32.reserved, sizeof(args.reserved)); 5308 + #else 5309 + return -ENOTTY; 5310 + #endif 5311 + } else { 5312 + if (copy_from_user(&args, argp, sizeof(args))) { 5313 + ret = -EFAULT; 5314 + goto out_acct; 5315 + } 5316 + } 5317 + 5318 + ret = -EINVAL; 5319 + if (args.flags != 0) 5320 + goto out_acct; 5321 + if (memchr_inv(args.reserved, 0, sizeof(args.reserved))) 5322 + goto out_acct; 5323 + if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && 5324 + args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) 5325 + goto out_acct; 5326 + if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || 5327 + args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) 5328 + goto out_acct; 5329 + 
if (args.unencoded_offset > args.unencoded_len) 5330 + goto out_acct; 5331 + if (args.len > args.unencoded_len - args.unencoded_offset) 5332 + goto out_acct; 5333 + 5334 + ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), 5335 + &iov, &iter); 5336 + if (ret < 0) 5337 + goto out_acct; 5338 + 5339 + file_start_write(file); 5340 + 5341 + if (iov_iter_count(&iter) == 0) { 5342 + ret = 0; 5343 + goto out_end_write; 5344 + } 5345 + pos = args.offset; 5346 + ret = rw_verify_area(WRITE, file, &pos, args.len); 5347 + if (ret < 0) 5348 + goto out_end_write; 5349 + 5350 + init_sync_kiocb(&kiocb, file); 5351 + ret = kiocb_set_rw_flags(&kiocb, 0); 5352 + if (ret) 5353 + goto out_end_write; 5354 + kiocb.ki_pos = pos; 5355 + 5356 + ret = btrfs_do_write_iter(&kiocb, &iter, &args); 5357 + if (ret > 0) 5358 + fsnotify_modify(file); 5359 + 5360 + out_end_write: 5361 + file_end_write(file); 5362 + kfree(iov); 5363 + out_acct: 5364 + if (ret > 0) 5365 + add_wchar(current, ret); 5366 + inc_syscw(current); 5214 5367 return ret; 5215 5368 } 5216 5369 ··· 5407 5194 5408 5195 switch (cmd) { 5409 5196 case FS_IOC_GETVERSION: 5410 - return btrfs_ioctl_getversion(file, argp); 5197 + return btrfs_ioctl_getversion(inode, argp); 5411 5198 case FS_IOC_GETFSLABEL: 5412 5199 return btrfs_ioctl_get_fslabel(fs_info, argp); 5413 5200 case FS_IOC_SETFSLABEL: ··· 5427 5214 case BTRFS_IOC_SNAP_DESTROY_V2: 5428 5215 return btrfs_ioctl_snap_destroy(file, argp, true); 5429 5216 case BTRFS_IOC_SUBVOL_GETFLAGS: 5430 - return btrfs_ioctl_subvol_getflags(file, argp); 5217 + return btrfs_ioctl_subvol_getflags(inode, argp); 5431 5218 case BTRFS_IOC_SUBVOL_SETFLAGS: 5432 5219 return btrfs_ioctl_subvol_setflags(file, argp); 5433 5220 case BTRFS_IOC_DEFAULT_SUBVOL: ··· 5451 5238 case BTRFS_IOC_BALANCE: 5452 5239 return btrfs_ioctl_balance(file, NULL); 5453 5240 case BTRFS_IOC_TREE_SEARCH: 5454 - return btrfs_ioctl_tree_search(file, argp); 5241 + return btrfs_ioctl_tree_search(inode, argp); 5455 
5242 case BTRFS_IOC_TREE_SEARCH_V2: 5456 - return btrfs_ioctl_tree_search_v2(file, argp); 5243 + return btrfs_ioctl_tree_search_v2(inode, argp); 5457 5244 case BTRFS_IOC_INO_LOOKUP: 5458 - return btrfs_ioctl_ino_lookup(file, argp); 5245 + return btrfs_ioctl_ino_lookup(root, argp); 5459 5246 case BTRFS_IOC_INO_PATHS: 5460 5247 return btrfs_ioctl_ino_to_path(root, argp); 5461 5248 case BTRFS_IOC_LOGICAL_INO: ··· 5502 5289 return btrfs_ioctl_set_received_subvol_32(file, argp); 5503 5290 #endif 5504 5291 case BTRFS_IOC_SEND: 5505 - return _btrfs_ioctl_send(file, argp, false); 5292 + return _btrfs_ioctl_send(inode, argp, false); 5506 5293 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 5507 5294 case BTRFS_IOC_SEND_32: 5508 - return _btrfs_ioctl_send(file, argp, true); 5295 + return _btrfs_ioctl_send(inode, argp, true); 5509 5296 #endif 5510 5297 case BTRFS_IOC_GET_DEV_STATS: 5511 5298 return btrfs_ioctl_get_dev_stats(fs_info, argp); ··· 5532 5319 case BTRFS_IOC_SET_FEATURES: 5533 5320 return btrfs_ioctl_set_features(file, argp); 5534 5321 case BTRFS_IOC_GET_SUBVOL_INFO: 5535 - return btrfs_ioctl_get_subvol_info(file, argp); 5322 + return btrfs_ioctl_get_subvol_info(inode, argp); 5536 5323 case BTRFS_IOC_GET_SUBVOL_ROOTREF: 5537 - return btrfs_ioctl_get_subvol_rootref(file, argp); 5324 + return btrfs_ioctl_get_subvol_rootref(root, argp); 5538 5325 case BTRFS_IOC_INO_LOOKUP_USER: 5539 5326 return btrfs_ioctl_ino_lookup_user(file, argp); 5540 5327 case FS_IOC_ENABLE_VERITY: 5541 5328 return fsverity_ioctl_enable(file, (const void __user *)argp); 5542 5329 case FS_IOC_MEASURE_VERITY: 5543 5330 return fsverity_ioctl_measure(file, argp); 5331 + case BTRFS_IOC_ENCODED_READ: 5332 + return btrfs_ioctl_encoded_read(file, argp, false); 5333 + case BTRFS_IOC_ENCODED_WRITE: 5334 + return btrfs_ioctl_encoded_write(file, argp, false); 5335 + #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) 5336 + case BTRFS_IOC_ENCODED_READ_32: 5337 + return btrfs_ioctl_encoded_read(file, 
argp, true); 5338 + case BTRFS_IOC_ENCODED_WRITE_32: 5339 + return btrfs_ioctl_encoded_write(file, argp, true); 5340 + #endif 5544 5341 } 5545 5342 5546 5343 return -ENOTTY;

+7 -4

fs/btrfs/lzo.c

··· 55 55 * 0x1000 | SegHdr N+1| Data payload N+1 ... | 56 56 */ 57 57 58 + #define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) 59 + #define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE)) 60 + 58 61 struct workspace { 59 62 void *mem; 60 63 void *buf; /* where decompressed data goes */ ··· 86 83 return ERR_PTR(-ENOMEM); 87 84 88 85 workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); 89 - workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); 90 - workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL); 86 + workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL); 87 + workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL); 91 88 if (!workspace->mem || !workspace->buf || !workspace->cbuf) 92 89 goto fail; 93 90 ··· 383 380 kunmap(cur_page); 384 381 cur_in += LZO_LEN; 385 382 386 - if (seg_len > lzo1x_worst_compress(PAGE_SIZE)) { 383 + if (seg_len > WORKSPACE_CBUF_LENGTH) { 387 384 /* 388 385 * seg_len shouldn't be larger than we have allocated 389 386 * for workspace->cbuf ··· 436 433 struct workspace *workspace = list_entry(ws, struct workspace, list); 437 434 size_t in_len; 438 435 size_t out_len; 439 - size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); 436 + size_t max_segment_len = WORKSPACE_BUF_LENGTH; 440 437 int ret = 0; 441 438 char *kaddr; 442 439 unsigned long bytes;

+44 -88

fs/btrfs/ordered-data.c

··· 143 143 return ret; 144 144 } 145 145 146 - /* 147 - * Allocate and add a new ordered_extent into the per-inode tree. 146 + /** 147 + * Add an ordered extent to the per-inode tree. 148 148 * 149 - * The tree is given a single reference on the ordered extent that was 150 - * inserted. 149 + * @inode: Inode that this extent is for. 150 + * @file_offset: Logical offset in file where the extent starts. 151 + * @num_bytes: Logical length of extent in file. 152 + * @ram_bytes: Full length of unencoded data. 153 + * @disk_bytenr: Offset of extent on disk. 154 + * @disk_num_bytes: Size of extent on disk. 155 + * @offset: Offset into unencoded data where file data starts. 156 + * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). 157 + * @compress_type: Compression algorithm used for data. 158 + * 159 + * Most of these parameters correspond to &struct btrfs_file_extent_item. The 160 + * tree is given a single reference on the ordered extent that was inserted. 161 + * 162 + * Return: 0 or -ENOMEM. 
151 163 */ 152 - static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, 153 - u64 disk_bytenr, u64 num_bytes, 154 - u64 disk_num_bytes, int type, int dio, 155 - int compress_type) 164 + int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, 165 + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, 166 + u64 disk_num_bytes, u64 offset, unsigned flags, 167 + int compress_type) 156 168 { 157 169 struct btrfs_root *root = inode->root; 158 170 struct btrfs_fs_info *fs_info = root->fs_info; ··· 173 161 struct btrfs_ordered_extent *entry; 174 162 int ret; 175 163 176 - if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) { 164 + if (flags & 165 + ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { 177 166 /* For nocow write, we can release the qgroup rsv right now */ 178 167 ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); 179 168 if (ret < 0) ··· 194 181 return -ENOMEM; 195 182 196 183 entry->file_offset = file_offset; 197 - entry->disk_bytenr = disk_bytenr; 198 184 entry->num_bytes = num_bytes; 185 + entry->ram_bytes = ram_bytes; 186 + entry->disk_bytenr = disk_bytenr; 199 187 entry->disk_num_bytes = disk_num_bytes; 188 + entry->offset = offset; 200 189 entry->bytes_left = num_bytes; 201 190 entry->inode = igrab(&inode->vfs_inode); 202 191 entry->compress_type = compress_type; ··· 206 191 entry->qgroup_rsv = ret; 207 192 entry->physical = (u64)-1; 208 193 209 - ASSERT(type == BTRFS_ORDERED_REGULAR || 210 - type == BTRFS_ORDERED_NOCOW || 211 - type == BTRFS_ORDERED_PREALLOC || 212 - type == BTRFS_ORDERED_COMPRESSED); 213 - set_bit(type, &entry->flags); 194 + ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); 195 + entry->flags = flags; 214 196 215 197 percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, 216 198 fs_info->delalloc_batch); 217 - 218 - if (dio) 219 - set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); 220 199 221 200 /* one ref for the tree */ 222 201 
refcount_set(&entry->refs, 1); ··· 254 245 spin_unlock(&inode->lock); 255 246 256 247 return 0; 257 - } 258 - 259 - int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, 260 - u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, 261 - int type) 262 - { 263 - ASSERT(type == BTRFS_ORDERED_REGULAR || 264 - type == BTRFS_ORDERED_NOCOW || 265 - type == BTRFS_ORDERED_PREALLOC); 266 - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, 267 - num_bytes, disk_num_bytes, type, 0, 268 - BTRFS_COMPRESS_NONE); 269 - } 270 - 271 - int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, 272 - u64 disk_bytenr, u64 num_bytes, 273 - u64 disk_num_bytes, int type) 274 - { 275 - ASSERT(type == BTRFS_ORDERED_REGULAR || 276 - type == BTRFS_ORDERED_NOCOW || 277 - type == BTRFS_ORDERED_PREALLOC); 278 - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, 279 - num_bytes, disk_num_bytes, type, 1, 280 - BTRFS_COMPRESS_NONE); 281 - } 282 - 283 - int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, 284 - u64 disk_bytenr, u64 num_bytes, 285 - u64 disk_num_bytes, int compress_type) 286 - { 287 - ASSERT(compress_type != BTRFS_COMPRESS_NONE); 288 - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, 289 - num_bytes, disk_num_bytes, 290 - BTRFS_ORDERED_COMPRESSED, 0, 291 - compress_type); 292 248 } 293 249 294 250 /* ··· 522 548 spin_lock(&btrfs_inode->lock); 523 549 btrfs_mod_outstanding_extents(btrfs_inode, -1); 524 550 spin_unlock(&btrfs_inode->lock); 525 - if (root != fs_info->tree_root) 526 - btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, 527 - false); 551 + if (root != fs_info->tree_root) { 552 + u64 release; 553 + 554 + if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags)) 555 + release = entry->disk_num_bytes; 556 + else 557 + release = entry->num_bytes; 558 + btrfs_delalloc_release_metadata(btrfs_inode, release, false); 559 + } 528 560 529 561 
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, 530 562 fs_info->delalloc_batch); ··· 1032 1052 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 1033 1053 u64 file_offset = ordered->file_offset + pos; 1034 1054 u64 disk_bytenr = ordered->disk_bytenr + pos; 1035 - u64 num_bytes = len; 1036 - u64 disk_num_bytes = len; 1037 - int type; 1038 - unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT); 1039 - int compress_type = ordered->compress_type; 1040 - unsigned long weight; 1041 - int ret; 1042 - 1043 - weight = hweight_long(flags_masked); 1044 - WARN_ON_ONCE(weight > 1); 1045 - if (!weight) 1046 - type = 0; 1047 - else 1048 - type = __ffs(flags_masked); 1055 + unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; 1049 1056 1050 1057 /* 1051 - * The splitting extent is already counted and will be added again 1052 - * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid 1053 - * double counting. 1058 + * The splitting extent is already counted and will be added again in 1059 + * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting. 
1054 1060 */ 1055 - percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes, 1061 + percpu_counter_add_batch(&fs_info->ordered_bytes, -len, 1056 1062 fs_info->delalloc_batch); 1057 - if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) { 1058 - WARN_ON_ONCE(1); 1059 - ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), 1060 - file_offset, disk_bytenr, num_bytes, 1061 - disk_num_bytes, compress_type); 1062 - } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 1063 - ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset, 1064 - disk_bytenr, num_bytes, disk_num_bytes, type); 1065 - } else { 1066 - ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, 1067 - disk_bytenr, num_bytes, disk_num_bytes, type); 1068 - } 1069 - 1070 - return ret; 1063 + WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED)); 1064 + return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, 1065 + disk_bytenr, len, 0, flags, 1066 + ordered->compress_type); 1071 1067 } 1072 1068 1073 1069 int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,

+16 -9

fs/btrfs/ordered-data.h

··· 74 74 BTRFS_ORDERED_LOGGED_CSUM, 75 75 /* We wait for this extent to complete in the current transaction */ 76 76 BTRFS_ORDERED_PENDING, 77 + /* BTRFS_IOC_ENCODED_WRITE */ 78 + BTRFS_ORDERED_ENCODED, 77 79 }; 80 + 81 + /* BTRFS_ORDERED_* flags that specify the type of the extent. */ 82 + #define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \ 83 + (1UL << BTRFS_ORDERED_NOCOW) | \ 84 + (1UL << BTRFS_ORDERED_PREALLOC) | \ 85 + (1UL << BTRFS_ORDERED_COMPRESSED) | \ 86 + (1UL << BTRFS_ORDERED_DIRECT) | \ 87 + (1UL << BTRFS_ORDERED_ENCODED)) 78 88 79 89 struct btrfs_ordered_extent { 80 90 /* logical offset in the file */ ··· 94 84 * These fields directly correspond to the same fields in 95 85 * btrfs_file_extent_item. 96 86 */ 97 - u64 disk_bytenr; 98 87 u64 num_bytes; 88 + u64 ram_bytes; 89 + u64 disk_bytenr; 99 90 u64 disk_num_bytes; 91 + u64 offset; 100 92 101 93 /* number of bytes that still need writing */ 102 94 u64 bytes_left; ··· 191 179 struct btrfs_ordered_extent **cached, 192 180 u64 file_offset, u64 io_size); 193 181 int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, 194 - u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, 195 - int type); 196 - int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, 197 - u64 disk_bytenr, u64 num_bytes, 198 - u64 disk_num_bytes, int type); 199 - int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, 200 - u64 disk_bytenr, u64 num_bytes, 201 - u64 disk_num_bytes, int compress_type); 182 + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, 183 + u64 disk_num_bytes, u64 offset, unsigned flags, 184 + int compress_type); 202 185 void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, 203 186 struct btrfs_ordered_sum *sum); 204 187 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,

+3 -2

fs/btrfs/print-tree.c

··· 23 23 { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, 24 24 { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, 25 25 { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, 26 + { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, 26 27 { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, 27 28 }; 28 29 ··· 392 391 btrfs_header_owner(c), 393 392 btrfs_node_ptr_generation(c, i), 394 393 level - 1, &first_key); 395 - if (IS_ERR(next)) { 394 + if (IS_ERR(next)) 396 395 continue; 397 - } else if (!extent_buffer_uptodate(next)) { 396 + if (!extent_buffer_uptodate(next)) { 398 397 free_extent_buffer(next); 399 398 continue; 400 399 }

+44 -28

fs/btrfs/qgroup.c

··· 25 25 #include "sysfs.h" 26 26 #include "tree-mod-log.h" 27 27 28 - /* TODO XXX FIXME 29 - * - subvol delete -> delete when ref goes to 0? delete limits also? 30 - * - reorganize keys 31 - * - compressed 32 - * - sync 33 - * - copy also limits on subvol creation 34 - * - limit 35 - * - caches for ulists 36 - * - performance benchmarks 37 - * - check all ioctl parameters 38 - */ 39 - 40 28 /* 41 29 * Helpers to access qgroup reservation 42 30 * ··· 246 258 return 0; 247 259 } 248 260 249 - /* must be called with qgroup_lock held */ 250 - static int add_relation_rb(struct btrfs_fs_info *fs_info, 251 - u64 memberid, u64 parentid) 261 + /* 262 + * Add relation specified by two qgroups. 263 + * 264 + * Must be called with qgroup_lock held. 265 + * 266 + * Return: 0 on success 267 + * -ENOENT if one of the qgroups is NULL 268 + * <0 other errors 269 + */ 270 + static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) 252 271 { 253 - struct btrfs_qgroup *member; 254 - struct btrfs_qgroup *parent; 255 272 struct btrfs_qgroup_list *list; 256 273 257 - member = find_qgroup_rb(fs_info, memberid); 258 - parent = find_qgroup_rb(fs_info, parentid); 259 274 if (!member || !parent) 260 275 return -ENOENT; 261 276 ··· 274 283 return 0; 275 284 } 276 285 277 - /* must be called with qgroup_lock held */ 286 + /* 287 + * Add relation specified by two qgoup ids. 288 + * 289 + * Must be called with qgroup_lock held. 
290 + * 291 + * Return: 0 on success 292 + * -ENOENT if one of the ids does not exist 293 + * <0 other errors 294 + */ 295 + static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) 296 + { 297 + struct btrfs_qgroup *member; 298 + struct btrfs_qgroup *parent; 299 + 300 + member = find_qgroup_rb(fs_info, memberid); 301 + parent = find_qgroup_rb(fs_info, parentid); 302 + 303 + return __add_relation_rb(member, parent); 304 + } 305 + 306 + /* Must be called with qgroup_lock held */ 278 307 static int del_relation_rb(struct btrfs_fs_info *fs_info, 279 308 u64 memberid, u64 parentid) 280 309 { ··· 959 948 */ 960 949 lockdep_assert_held_write(&fs_info->subvol_sem); 961 950 951 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 952 + btrfs_err(fs_info, 953 + "qgroups are currently unsupported in extent tree v2"); 954 + return -EINVAL; 955 + } 956 + 962 957 mutex_lock(&fs_info->qgroup_ioctl_lock); 963 958 if (fs_info->quota_root) 964 959 goto out; ··· 1468 1451 } 1469 1452 1470 1453 spin_lock(&fs_info->qgroup_lock); 1471 - ret = add_relation_rb(fs_info, src, dst); 1454 + ret = __add_relation_rb(member, parent); 1472 1455 if (ret < 0) { 1473 1456 spin_unlock(&fs_info->qgroup_lock); 1474 1457 goto out; ··· 3285 3268 static bool rescan_should_stop(struct btrfs_fs_info *fs_info) 3286 3269 { 3287 3270 return btrfs_fs_closing(fs_info) || 3288 - test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); 3271 + test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || 3272 + !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 3289 3273 } 3290 3274 3291 3275 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) ··· 3316 3298 err = PTR_ERR(trans); 3317 3299 break; 3318 3300 } 3319 - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 3320 - err = -EINTR; 3321 - } else { 3322 - err = qgroup_rescan_leaf(trans, path); 3323 - } 3301 + 3302 + err = qgroup_rescan_leaf(trans, path); 3303 + 3324 3304 if (err > 0) 3325 3305 
btrfs_commit_transaction(trans); 3326 3306 else ··· 3332 3316 if (err > 0 && 3333 3317 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { 3334 3318 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3335 - } else if (err < 0) { 3319 + } else if (err < 0 || stopped) { 3336 3320 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3337 3321 } 3338 3322 mutex_unlock(&fs_info->qgroup_rescan_lock);

+25 -18

fs/btrfs/reflink.c

··· 277 277 path->slots[0]), 278 278 size); 279 279 btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); 280 - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags); 280 + btrfs_set_inode_full_sync(BTRFS_I(dst)); 281 281 ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); 282 282 out: 283 283 if (!ret && !trans) { ··· 494 494 &clone_info, &trans); 495 495 if (ret) 496 496 goto out; 497 - } else if (type == BTRFS_FILE_EXTENT_INLINE) { 497 + } else { 498 + ASSERT(type == BTRFS_FILE_EXTENT_INLINE); 498 499 /* 499 500 * Inline extents always have to start at file offset 0 500 501 * and can never be bigger then the sector size. We can ··· 506 505 */ 507 506 ASSERT(key.offset == 0); 508 507 ASSERT(datal <= fs_info->sectorsize); 509 - if (key.offset != 0 || datal > fs_info->sectorsize) 510 - return -EUCLEAN; 508 + if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) || 509 + WARN_ON(key.offset != 0) || 510 + WARN_ON(datal > fs_info->sectorsize)) { 511 + ret = -EUCLEAN; 512 + goto out; 513 + } 511 514 512 515 ret = clone_copy_inline_extent(inode, path, &new_key, 513 516 drop_start, datal, size, ··· 523 518 btrfs_release_path(path); 524 519 525 520 /* 526 - * If this is a new extent update the last_reflink_trans of both 527 - * inodes. This is used by fsync to make sure it does not log 528 - * multiple checksum items with overlapping ranges. For older 529 - * extents we don't need to do it since inode logging skips the 530 - * checksums for older extents. Also ignore holes and inline 531 - * extents because they don't have checksums in the csum tree. 521 + * Whenever we share an extent we update the last_reflink_trans 522 + * of each inode to the current transaction. This is needed to 523 + * make sure fsync does not log multiple checksum items with 524 + * overlapping ranges (because some extent items might refer 525 + * only to sections of the original extent). 
For the destination 526 + * inode we do this regardless of the generation of the extents 527 + * or even if they are inline extents or explicit holes, to make 528 + * sure a full fsync does not skip them. For the source inode, 529 + * we only need to update last_reflink_trans in case it's a new 530 + * extent that is not a hole or an inline extent, to deal with 531 + * the checksums problem on fsync. 532 532 */ 533 - if (extent_gen == trans->transid && disko > 0) { 533 + if (extent_gen == trans->transid && disko > 0) 534 534 BTRFS_I(src)->last_reflink_trans = trans->transid; 535 - BTRFS_I(inode)->last_reflink_trans = trans->transid; 536 - } 535 + 536 + BTRFS_I(inode)->last_reflink_trans = trans->transid; 537 537 538 538 last_dest_end = ALIGN(new_key.offset + datal, 539 539 fs_info->sectorsize); ··· 585 575 * replaced file extent items. 586 576 */ 587 577 if (last_dest_end >= i_size_read(inode)) 588 - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 589 - &BTRFS_I(inode)->runtime_flags); 578 + btrfs_set_inode_full_sync(BTRFS_I(inode)); 590 579 591 580 ret = btrfs_replace_file_extents(BTRFS_I(inode), path, 592 581 last_dest_end, destoff + len - 1, NULL, &trans); ··· 781 772 if (btrfs_root_readonly(root_out)) 782 773 return -EROFS; 783 774 784 - if (file_in->f_path.mnt != file_out->f_path.mnt || 785 - inode_in->i_sb != inode_out->i_sb) 786 - return -EXDEV; 775 + ASSERT(inode_in->i_sb == inode_out->i_sb); 787 776 } 788 777 789 778 /* Don't make the dst file partly checksummed */

+5 -6

fs/btrfs/relocation.c

··· 2599 2599 2600 2600 eb = read_tree_block(fs_info, block->bytenr, block->owner, 2601 2601 block->key.offset, block->level, NULL); 2602 - if (IS_ERR(eb)) { 2602 + if (IS_ERR(eb)) 2603 2603 return PTR_ERR(eb); 2604 - } else if (!extent_buffer_uptodate(eb)) { 2604 + if (!extent_buffer_uptodate(eb)) { 2605 2605 free_extent_buffer(eb); 2606 2606 return -EIO; 2607 2607 } ··· 2997 2997 2998 2998 /* Reserve metadata for this range */ 2999 2999 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 3000 - clamped_len); 3000 + clamped_len, clamped_len); 3001 3001 if (ret) 3002 3002 goto release_page; 3003 3003 ··· 4123 4123 * this function resumes merging reloc trees with corresponding fs trees. 4124 4124 * this is important for keeping the sharing of tree blocks 4125 4125 */ 4126 - int btrfs_recover_relocation(struct btrfs_root *root) 4126 + int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) 4127 4127 { 4128 - struct btrfs_fs_info *fs_info = root->fs_info; 4129 4128 LIST_HEAD(reloc_roots); 4130 4129 struct btrfs_key key; 4131 4130 struct btrfs_root *fs_root; ··· 4165 4166 key.type != BTRFS_ROOT_ITEM_KEY) 4166 4167 break; 4167 4168 4168 - reloc_root = btrfs_read_tree_root(root, &key); 4169 + reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key); 4169 4170 if (IS_ERR(reloc_root)) { 4170 4171 err = PTR_ERR(reloc_root); 4171 4172 goto out;

+1 -1

fs/btrfs/scrub.c

··· 3190 3190 u64 generation; 3191 3191 int mirror_num; 3192 3192 struct btrfs_key key; 3193 - u64 increment = map->stripe_len; 3193 + u64 increment; 3194 3194 u64 offset; 3195 3195 u64 extent_logical; 3196 3196 u64 extent_physical;

+3 -8

fs/btrfs/send.c

··· 528 528 529 529 static int fs_path_copy(struct fs_path *p, struct fs_path *from) 530 530 { 531 - int ret; 532 - 533 531 p->reversed = from->reversed; 534 532 fs_path_reset(p); 535 533 536 - ret = fs_path_add_path(p, from); 537 - 538 - return ret; 534 + return fs_path_add_path(p, from); 539 535 } 540 - 541 536 542 537 static void fs_path_unreverse(struct fs_path *p) 543 538 { ··· 7472 7477 root->root_key.objectid, root->dedupe_in_progress); 7473 7478 } 7474 7479 7475 - long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) 7480 + long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) 7476 7481 { 7477 7482 int ret = 0; 7478 - struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; 7483 + struct btrfs_root *send_root = BTRFS_I(inode)->root; 7479 7484 struct btrfs_fs_info *fs_info = send_root->fs_info; 7480 7485 struct btrfs_root *clone_root; 7481 7486 struct send_ctx *sctx = NULL;

+1 -1

fs/btrfs/send.h

··· 126 126 #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) 127 127 128 128 #ifdef __KERNEL__ 129 - long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); 129 + long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); 130 130 #endif 131 131 132 132 #endif

+4 -1

fs/btrfs/space-info.c

··· 737 737 u64 thresh = div_factor_fine(space_info->total_bytes, 90); 738 738 u64 used; 739 739 740 + lockdep_assert_held(&space_info->lock); 741 + 740 742 /* If we're just plain full then async reclaim just slows us down. */ 741 743 if ((space_info->bytes_used + space_info->bytes_reserved + 742 744 global_rsv_size) >= thresh) ··· 1063 1061 trans_rsv->reserved; 1064 1062 if (block_rsv_size < space_info->bytes_may_use) 1065 1063 delalloc_size = space_info->bytes_may_use - block_rsv_size; 1066 - spin_unlock(&space_info->lock); 1067 1064 1068 1065 /* 1069 1066 * We don't want to include the global_rsv in our calculation, ··· 1092 1091 to_reclaim = delayed_refs_rsv->reserved; 1093 1092 flush = FLUSH_DELAYED_REFS_NR; 1094 1093 } 1094 + 1095 + spin_unlock(&space_info->lock); 1095 1096 1096 1097 /* 1097 1098 * We don't want to reclaim everything, just a portion, so scale

+87 -9

fs/btrfs/super.c

··· 66 66 67 67 static int btrfs_remount(struct super_block *sb, int *flags, char *data); 68 68 69 + #ifdef CONFIG_PRINTK 70 + 71 + #define STATE_STRING_PREFACE ": state " 72 + #define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) 73 + 74 + /* 75 + * Characters to print to indicate error conditions or uncommon filesystem sate. 76 + * RO is not an error. 77 + */ 78 + static const char fs_state_chars[] = { 79 + [BTRFS_FS_STATE_ERROR] = 'E', 80 + [BTRFS_FS_STATE_REMOUNTING] = 'M', 81 + [BTRFS_FS_STATE_RO] = 0, 82 + [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', 83 + [BTRFS_FS_STATE_DEV_REPLACING] = 'R', 84 + [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, 85 + [BTRFS_FS_STATE_NO_CSUMS] = 'C', 86 + [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', 87 + }; 88 + 89 + static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) 90 + { 91 + unsigned int bit; 92 + bool states_printed = false; 93 + unsigned long fs_state = READ_ONCE(info->fs_state); 94 + char *curr = buf; 95 + 96 + memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); 97 + curr += sizeof(STATE_STRING_PREFACE) - 1; 98 + 99 + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { 100 + WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); 101 + if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { 102 + *curr++ = fs_state_chars[bit]; 103 + states_printed = true; 104 + } 105 + } 106 + 107 + /* If no states were printed, reset the buffer */ 108 + if (!states_printed) 109 + curr = buf; 110 + 111 + *curr++ = 0; 112 + } 113 + #endif 114 + 69 115 /* 70 116 * Generally the error codes correspond to their respective errors, but there 71 117 * are a few special cases. 
··· 174 128 { 175 129 struct super_block *sb = fs_info->sb; 176 130 #ifdef CONFIG_PRINTK 131 + char statestr[STATE_STRING_BUF_LEN]; 177 132 const char *errstr; 178 133 #endif 179 134 ··· 187 140 188 141 #ifdef CONFIG_PRINTK 189 142 errstr = btrfs_decode_error(errno); 143 + btrfs_state_to_string(fs_info, statestr); 190 144 if (fmt) { 191 145 struct va_format vaf; 192 146 va_list args; ··· 196 148 vaf.fmt = fmt; 197 149 vaf.va = &args; 198 150 199 - pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n", 200 - sb->s_id, function, line, errno, errstr, &vaf); 151 + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", 152 + sb->s_id, statestr, function, line, errno, errstr, &vaf); 201 153 va_end(args); 202 154 } else { 203 - pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n", 204 - sb->s_id, function, line, errno, errstr); 155 + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", 156 + sb->s_id, statestr, function, line, errno, errstr); 205 157 } 206 158 #endif 207 159 ··· 288 240 vaf.va = &args; 289 241 290 242 if (__ratelimit(ratelimit)) { 291 - if (fs_info) 292 - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, 293 - fs_info->sb->s_id, &vaf); 294 - else 243 + if (fs_info) { 244 + char statestr[STATE_STRING_BUF_LEN]; 245 + 246 + btrfs_state_to_string(fs_info, statestr); 247 + printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, 248 + fs_info->sb->s_id, statestr, &vaf); 249 + } else { 295 250 printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); 251 + } 296 252 } 297 253 298 254 va_end(args); ··· 913 861 break; 914 862 case Opt_space_cache: 915 863 case Opt_space_cache_version: 864 + /* 865 + * We already set FREE_SPACE_TREE above because we have 866 + * compat_ro(FREE_SPACE_TREE) set, and we aren't going 867 + * to allow v1 to be set for extent tree v2, simply 868 + * ignore this setting if we're extent tree v2. 
869 + */ 870 + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) 871 + break; 916 872 if (token == Opt_space_cache || 917 873 strcmp(args[0].from, "v1") == 0) { 918 874 btrfs_clear_opt(info->mount_opt, ··· 941 881 btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); 942 882 break; 943 883 case Opt_no_space_cache: 884 + /* 885 + * We cannot operate without the free space tree with 886 + * extent tree v2, ignore this option. 887 + */ 888 + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) 889 + break; 944 890 if (btrfs_test_opt(info, SPACE_CACHE)) { 945 891 btrfs_clear_and_info(info, SPACE_CACHE, 946 892 "disabling disk space caching"); ··· 962 896 "the 'inode_cache' option is deprecated and has no effect since 5.11"); 963 897 break; 964 898 case Opt_clear_cache: 899 + /* 900 + * We cannot clear the free space tree with extent tree 901 + * v2, ignore this option. 902 + */ 903 + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) 904 + break; 965 905 btrfs_set_and_info(info, CLEAR_CACHE, 966 906 "force clearing of disk cache"); 967 907 break; ··· 2455 2383 { 2456 2384 struct btrfs_ioctl_vol_args *vol; 2457 2385 struct btrfs_device *device = NULL; 2386 + dev_t devt = 0; 2458 2387 int ret = -ENOTTY; 2459 2388 2460 2389 if (!capable(CAP_SYS_ADMIN)) ··· 2475 2402 mutex_unlock(&uuid_mutex); 2476 2403 break; 2477 2404 case BTRFS_IOC_FORGET_DEV: 2478 - ret = btrfs_forget_devices(vol->name); 2405 + if (vol->name[0] != 0) { 2406 + ret = lookup_bdev(vol->name, &devt); 2407 + if (ret) 2408 + break; 2409 + } 2410 + ret = btrfs_forget_devices(devt); 2479 2411 break; 2480 2412 case BTRFS_IOC_DEVICES_READY: 2481 2413 mutex_lock(&uuid_mutex);

+9 -6

fs/btrfs/sysfs.c

··· 283 283 BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); 284 284 BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); 285 285 BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); 286 - /* Remove once support for zoned allocation is feature complete */ 287 286 #ifdef CONFIG_BTRFS_DEBUG 287 + /* Remove once support for zoned allocation is feature complete */ 288 288 BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); 289 + /* Remove once support for extent tree v2 is feature complete */ 290 + BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); 289 291 #endif 290 292 #ifdef CONFIG_FS_VERITY 291 293 BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); ··· 316 314 BTRFS_FEAT_ATTR_PTR(raid1c34), 317 315 #ifdef CONFIG_BTRFS_DEBUG 318 316 BTRFS_FEAT_ATTR_PTR(zoned), 317 + BTRFS_FEAT_ATTR_PTR(extent_tree_v2), 319 318 #endif 320 319 #ifdef CONFIG_FS_VERITY 321 320 BTRFS_FEAT_ATTR_PTR(verity), ··· 1107 1104 static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; 1108 1105 static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS]; 1109 1106 1107 + static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) == 1108 + ARRAY_SIZE(btrfs_feature_attrs)); 1109 + static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) == 1110 + ARRAY_SIZE(btrfs_feature_attrs[0])); 1111 + 1110 1112 static const u64 supported_feature_masks[FEAT_MAX] = { 1111 1113 [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, 1112 1114 [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, ··· 1279 1271 { 1280 1272 struct btrfs_feature_attr *fa; 1281 1273 int set, i; 1282 - 1283 - BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) != 1284 - ARRAY_SIZE(btrfs_feature_attrs)); 1285 - BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) != 1286 - ARRAY_SIZE(btrfs_feature_attrs[0])); 1287 1274 1288 1275 memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs)); 1289 1276 memset(btrfs_unknown_feature_names, 0,

+2

fs/btrfs/tests/extent-map-tests.c

··· 15 15 struct extent_map *em; 16 16 struct rb_node *node; 17 17 18 + write_lock(&em_tree->lock); 18 19 while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { 19 20 node = rb_first_cached(&em_tree->map); 20 21 em = rb_entry(node, struct extent_map, rb_node); ··· 33 32 #endif 34 33 free_extent_map(em); 35 34 } 35 + write_unlock(&em_tree->lock); 36 36 } 37 37 38 38 /*

+17 -2

fs/btrfs/transaction.c

··· 1911 1911 super->cache_generation = 0; 1912 1912 if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) 1913 1913 super->uuid_tree_generation = root_item->generation; 1914 + 1915 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 1916 + root_item = &fs_info->block_group_root->root_item; 1917 + 1918 + super->block_group_root = root_item->bytenr; 1919 + super->block_group_root_generation = root_item->generation; 1920 + super->block_group_root_level = root_item->level; 1921 + } 1914 1922 } 1915 1923 1916 1924 int btrfs_transaction_in_commit(struct btrfs_fs_info *info) ··· 2370 2362 list_add_tail(&fs_info->chunk_root->dirty_list, 2371 2363 &cur_trans->switch_commits); 2372 2364 2365 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 2366 + btrfs_set_root_node(&fs_info->block_group_root->root_item, 2367 + fs_info->block_group_root->node); 2368 + list_add_tail(&fs_info->block_group_root->dirty_list, 2369 + &cur_trans->switch_commits); 2370 + } 2371 + 2373 2372 switch_commit_roots(trans); 2374 2373 2375 2374 ASSERT(list_empty(&cur_trans->dirty_bgs)); ··· 2505 2490 * because btrfs_commit_super will poke cleaner thread and it will process it a 2506 2491 * few seconds later. 2507 2492 */ 2508 - int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) 2493 + int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) 2509 2494 { 2495 + struct btrfs_root *root; 2510 2496 int ret; 2511 - struct btrfs_fs_info *fs_info = root->fs_info; 2512 2497 2513 2498 spin_lock(&fs_info->trans_lock); 2514 2499 if (list_empty(&fs_info->dead_roots)) {

+1 -1

fs/btrfs/transaction.h

··· 217 217 void btrfs_add_dead_root(struct btrfs_root *root); 218 218 int btrfs_defrag_root(struct btrfs_root *root); 219 219 void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); 220 - int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); 220 + int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); 221 221 int btrfs_commit_transaction(struct btrfs_trans_handle *trans); 222 222 void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); 223 223 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);

+32 -3

fs/btrfs/tree-checker.c

··· 639 639 static int check_block_group_item(struct extent_buffer *leaf, 640 640 struct btrfs_key *key, int slot) 641 641 { 642 + struct btrfs_fs_info *fs_info = leaf->fs_info; 642 643 struct btrfs_block_group_item bgi; 643 644 u32 item_size = btrfs_item_size(leaf, slot); 645 + u64 chunk_objectid; 644 646 u64 flags; 645 647 u64 type; 646 648 ··· 665 663 666 664 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), 667 665 sizeof(bgi)); 668 - if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) != 669 - BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { 666 + chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi); 667 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 668 + /* 669 + * We don't init the nr_global_roots until we load the global 670 + * roots, so this could be 0 at mount time. If it's 0 we'll 671 + * just assume we're fine, and later we'll check against our 672 + * actual value. 673 + */ 674 + if (unlikely(fs_info->nr_global_roots && 675 + chunk_objectid >= fs_info->nr_global_roots)) { 676 + block_group_err(leaf, slot, 677 + "invalid block group global root id, have %llu, needs to be <= %llu", 678 + chunk_objectid, 679 + fs_info->nr_global_roots); 680 + return -EUCLEAN; 681 + } 682 + } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { 670 683 block_group_err(leaf, slot, 671 684 "invalid block group chunk objectid, have %llu expect %llu", 672 685 btrfs_stack_block_group_chunk_objectid(&bgi), ··· 1665 1648 /* These trees must never be empty */ 1666 1649 if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID || 1667 1650 owner == BTRFS_CHUNK_TREE_OBJECTID || 1668 - owner == BTRFS_EXTENT_TREE_OBJECTID || 1669 1651 owner == BTRFS_DEV_TREE_OBJECTID || 1670 1652 owner == BTRFS_FS_TREE_OBJECTID || 1671 1653 owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { ··· 1673 1657 owner); 1674 1658 return -EUCLEAN; 1675 1659 } 1660 + 1676 1661 /* Unknown tree */ 1677 1662 if (unlikely(owner == 0)) { 1678 1663 generic_err(leaf, 0, 1679 1664 "invalid 
owner, root 0 is not defined"); 1680 1665 return -EUCLEAN; 1681 1666 } 1667 + 1668 + /* EXTENT_TREE_V2 can have empty extent trees. */ 1669 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 1670 + return 0; 1671 + 1672 + if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) { 1673 + generic_err(leaf, 0, 1674 + "invalid root, root %llu must never be empty", 1675 + owner); 1676 + return -EUCLEAN; 1677 + } 1678 + 1682 1679 return 0; 1683 1680 } 1684 1681

+613 -371

fs/btrfs/tree-log.c

··· 270 270 } 271 271 } 272 272 273 - static int btrfs_write_tree_block(struct extent_buffer *buf) 274 - { 275 - return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, 276 - buf->start + buf->len - 1); 277 - } 278 - 279 273 static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 280 274 { 281 275 filemap_fdatawait_range(buf->pages[0]->mapping, ··· 287 293 * at transaction commit time while freeing a log tree 288 294 */ 289 295 int free; 290 - 291 - /* should we write out the extent buffer? This is used 292 - * while flushing the log tree to disk during a sync 293 - */ 294 - int write; 295 - 296 - /* should we wait for the extent buffer io to finish? Also used 297 - * while flushing the log tree to disk for a sync 298 - */ 299 - int wait; 300 296 301 297 /* pin only walk, we record which extents on disk belong to the 302 298 * log trees ··· 338 354 return ret; 339 355 } 340 356 341 - if (wc->pin) 357 + if (wc->pin) { 342 358 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, 343 359 eb->len); 360 + if (ret) 361 + return ret; 344 362 345 - if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 346 - if (wc->pin && btrfs_header_level(eb) == 0) 363 + if (btrfs_buffer_uptodate(eb, gen, 0) && 364 + btrfs_header_level(eb) == 0) 347 365 ret = btrfs_exclude_logged_extents(eb); 348 - if (wc->write) 349 - btrfs_write_tree_block(eb); 350 - if (wc->wait) 351 - btrfs_wait_tree_block_writeback(eb); 352 366 } 353 367 return ret; 354 368 } ··· 899 917 return ret; 900 918 } 901 919 920 + static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, 921 + struct btrfs_inode *dir, 922 + struct btrfs_inode *inode, 923 + const char *name, 924 + int name_len) 925 + { 926 + int ret; 927 + 928 + ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); 929 + if (ret) 930 + return ret; 931 + /* 932 + * Whenever we need to check if a name exists or not, we check the 933 + * fs/subvolume tree. 
So after an unlink we must run delayed items, so 934 + * that future checks for a name during log replay see that the name 935 + * does not exists anymore. 936 + */ 937 + return btrfs_run_delayed_items(trans); 938 + } 939 + 902 940 /* 903 941 * when cleaning up conflicts between the directory names in the 904 942 * subvolume, directory names in the log and directory names in the ··· 961 959 if (ret) 962 960 goto out; 963 961 964 - ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name, 962 + ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, 965 963 name_len); 966 - if (ret) 967 - goto out; 968 - else 969 - ret = btrfs_run_delayed_items(trans); 970 964 out: 971 965 kfree(name); 972 966 iput(inode); ··· 1122 1124 inc_nlink(&inode->vfs_inode); 1123 1125 btrfs_release_path(path); 1124 1126 1125 - ret = btrfs_unlink_inode(trans, dir, inode, 1127 + ret = unlink_inode_for_log_replay(trans, dir, inode, 1126 1128 victim_name, victim_name_len); 1127 1129 kfree(victim_name); 1128 - if (ret) 1129 - return ret; 1130 - ret = btrfs_run_delayed_items(trans); 1131 1130 if (ret) 1132 1131 return ret; 1133 1132 *search_done = 1; ··· 1191 1196 inc_nlink(&inode->vfs_inode); 1192 1197 btrfs_release_path(path); 1193 1198 1194 - ret = btrfs_unlink_inode(trans, 1199 + ret = unlink_inode_for_log_replay(trans, 1195 1200 BTRFS_I(victim_parent), 1196 1201 inode, 1197 1202 victim_name, 1198 1203 victim_name_len); 1199 - if (!ret) 1200 - ret = btrfs_run_delayed_items( 1201 - trans); 1202 1204 } 1203 1205 iput(victim_parent); 1204 1206 kfree(victim_name); ··· 1350 1358 kfree(name); 1351 1359 goto out; 1352 1360 } 1353 - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), 1361 + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), 1354 1362 inode, name, namelen); 1355 1363 kfree(name); 1356 1364 iput(dir); 1357 - /* 1358 - * Whenever we need to check if a name exists or not, we 1359 - * check the subvolume tree. 
So after an unlink we must 1360 - * run delayed items, so that future checks for a name 1361 - * during log replay see that the name does not exists 1362 - * anymore. 1363 - */ 1364 - if (!ret) 1365 - ret = btrfs_run_delayed_items(trans); 1366 1365 if (ret) 1367 1366 goto out; 1368 1367 goto again; ··· 1449 1466 ret = -ENOENT; 1450 1467 goto out; 1451 1468 } 1452 - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode), 1453 - name, namelen); 1469 + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode), 1470 + name, namelen); 1454 1471 if (ret) 1455 1472 goto out; 1456 1473 /* ··· 1459 1476 */ 1460 1477 if (other_inode->i_nlink == 0) 1461 1478 inc_nlink(other_inode); 1462 - 1463 - ret = btrfs_run_delayed_items(trans); 1464 - if (ret) 1465 - goto out; 1466 1479 add_link: 1467 1480 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 1468 1481 name, namelen, 0, ref_index); ··· 1591 1612 ret = btrfs_inode_ref_exists(inode, dir, key->type, 1592 1613 name, namelen); 1593 1614 if (ret > 0) { 1594 - ret = btrfs_unlink_inode(trans, 1615 + ret = unlink_inode_for_log_replay(trans, 1595 1616 BTRFS_I(dir), 1596 1617 BTRFS_I(inode), 1597 1618 name, namelen); ··· 1602 1623 */ 1603 1624 if (!ret && inode->i_nlink == 0) 1604 1625 inc_nlink(inode); 1605 - /* 1606 - * Whenever we need to check if a name exists or 1607 - * not, we check the subvolume tree. So after an 1608 - * unlink we must run delayed items, so that future 1609 - * checks for a name during log replay see that the 1610 - * name does not exists anymore. 
1611 - */ 1612 - if (!ret) 1613 - ret = btrfs_run_delayed_items(trans); 1614 1626 } 1615 1627 if (ret < 0) 1616 1628 goto out; ··· 2338 2368 goto out; 2339 2369 2340 2370 inc_nlink(inode); 2341 - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, 2342 - name_len); 2343 - if (ret) 2344 - goto out; 2345 - 2346 - ret = btrfs_run_delayed_items(trans); 2347 - if (ret) 2348 - goto out; 2349 - 2371 + ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), 2372 + name, name_len); 2350 2373 /* 2351 2374 * Unlike dir item keys, dir index keys can only have one name (entry) in 2352 2375 * them, as there are no key collisions since each key has a unique offset ··· 3458 3495 } 3459 3496 3460 3497 /* 3461 - * Check if an inode was logged in the current transaction. This may often 3462 - * return some false positives, because logged_trans is an in memory only field, 3463 - * not persisted anywhere. This is meant to be used in contexts where a false 3464 - * positive has no functional consequences. 3498 + * Check if an inode was logged in the current transaction. This correctly deals 3499 + * with the case where the inode was logged but has a logged_trans of 0, which 3500 + * happens if the inode is evicted and loaded again, as logged_trans is an in 3501 + * memory only field (not persisted). 3502 + * 3503 + * Returns 1 if the inode was logged before in the transaction, 0 if it was not, 3504 + * and < 0 on error. 
3465 3505 */ 3466 - static bool inode_logged(struct btrfs_trans_handle *trans, 3467 - struct btrfs_inode *inode) 3506 + static int inode_logged(struct btrfs_trans_handle *trans, 3507 + struct btrfs_inode *inode, 3508 + struct btrfs_path *path_in) 3468 3509 { 3469 - if (inode->logged_trans == trans->transid) 3470 - return true; 3510 + struct btrfs_path *path = path_in; 3511 + struct btrfs_key key; 3512 + int ret; 3471 3513 3472 - if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) 3473 - return false; 3514 + if (inode->logged_trans == trans->transid) 3515 + return 1; 3474 3516 3475 3517 /* 3476 - * The inode's logged_trans is always 0 when we load it (because it is 3477 - * not persisted in the inode item or elsewhere). So if it is 0, the 3478 - * inode was last modified in the current transaction then the inode may 3479 - * have been logged before in the current transaction, then evicted and 3480 - * loaded again in the current transaction - or may have never been logged 3481 - * in the current transaction, but since we can not be sure, we have to 3482 - * assume it was, otherwise our callers can leave an inconsistent log. 3518 + * If logged_trans is not 0, then we know the inode logged was not logged 3519 + * in this transaction, so we can return false right away. 3483 3520 */ 3484 - if (inode->logged_trans == 0 && 3485 - inode->last_trans == trans->transid && 3486 - !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) 3487 - return true; 3521 + if (inode->logged_trans > 0) 3522 + return 0; 3488 3523 3489 - return false; 3524 + /* 3525 + * If no log tree was created for this root in this transaction, then 3526 + * the inode can not have been logged in this transaction. In that case 3527 + * set logged_trans to anything greater than 0 and less than the current 3528 + * transaction's ID, to avoid the search below in a future call in case 3529 + * a log tree gets created after this. 
3530 + */ 3531 + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { 3532 + inode->logged_trans = trans->transid - 1; 3533 + return 0; 3534 + } 3535 + 3536 + /* 3537 + * We have a log tree and the inode's logged_trans is 0. We can't tell 3538 + * for sure if the inode was logged before in this transaction by looking 3539 + * only at logged_trans. We could be pessimistic and assume it was, but 3540 + * that can lead to unnecessarily logging an inode during rename and link 3541 + * operations, and then further updating the log in followup rename and 3542 + * link operations, specially if it's a directory, which adds latency 3543 + * visible to applications doing a series of rename or link operations. 3544 + * 3545 + * A logged_trans of 0 here can mean several things: 3546 + * 3547 + * 1) The inode was never logged since the filesystem was mounted, and may 3548 + * or may have not been evicted and loaded again; 3549 + * 3550 + * 2) The inode was logged in a previous transaction, then evicted and 3551 + * then loaded again; 3552 + * 3553 + * 3) The inode was logged in the current transaction, then evicted and 3554 + * then loaded again. 3555 + * 3556 + * For cases 1) and 2) we don't want to return true, but we need to detect 3557 + * case 3) and return true. So we do a search in the log root for the inode 3558 + * item. 3559 + */ 3560 + key.objectid = btrfs_ino(inode); 3561 + key.type = BTRFS_INODE_ITEM_KEY; 3562 + key.offset = 0; 3563 + 3564 + if (!path) { 3565 + path = btrfs_alloc_path(); 3566 + if (!path) 3567 + return -ENOMEM; 3568 + } 3569 + 3570 + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 3571 + 3572 + if (path_in) 3573 + btrfs_release_path(path); 3574 + else 3575 + btrfs_free_path(path); 3576 + 3577 + /* 3578 + * Logging an inode always results in logging its inode item. So if we 3579 + * did not find the item we know the inode was not logged for sure. 
3580 + */ 3581 + if (ret < 0) { 3582 + return ret; 3583 + } else if (ret > 0) { 3584 + /* 3585 + * Set logged_trans to a value greater than 0 and less then the 3586 + * current transaction to avoid doing the search in future calls. 3587 + */ 3588 + inode->logged_trans = trans->transid - 1; 3589 + return 0; 3590 + } 3591 + 3592 + /* 3593 + * The inode was previously logged and then evicted, set logged_trans to 3594 + * the current transacion's ID, to avoid future tree searches as long as 3595 + * the inode is not evicted again. 3596 + */ 3597 + inode->logged_trans = trans->transid; 3598 + 3599 + /* 3600 + * If it's a directory, then we must set last_dir_index_offset to the 3601 + * maximum possible value, so that the next attempt to log the inode does 3602 + * not skip checking if dir index keys found in modified subvolume tree 3603 + * leaves have been logged before, otherwise it would result in attempts 3604 + * to insert duplicate dir index keys in the log tree. This must be done 3605 + * because last_dir_index_offset is an in-memory only field, not persisted 3606 + * in the inode item or any other on-disk structure, so its value is lost 3607 + * once the inode is evicted. 3608 + */ 3609 + if (S_ISDIR(inode->vfs_inode.i_mode)) 3610 + inode->last_dir_index_offset = (u64)-1; 3611 + 3612 + return 1; 3613 + } 3614 + 3615 + /* 3616 + * Delete a directory entry from the log if it exists. 3617 + * 3618 + * Returns < 0 on error 3619 + * 1 if the entry does not exists 3620 + * 0 if the entry existed and was successfully deleted 3621 + */ 3622 + static int del_logged_dentry(struct btrfs_trans_handle *trans, 3623 + struct btrfs_root *log, 3624 + struct btrfs_path *path, 3625 + u64 dir_ino, 3626 + const char *name, int name_len, 3627 + u64 index) 3628 + { 3629 + struct btrfs_dir_item *di; 3630 + 3631 + /* 3632 + * We only log dir index items of a directory, so we don't need to look 3633 + * for dir item keys. 
3634 + */ 3635 + di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3636 + index, name, name_len, -1); 3637 + if (IS_ERR(di)) 3638 + return PTR_ERR(di); 3639 + else if (!di) 3640 + return 1; 3641 + 3642 + /* 3643 + * We do not need to update the size field of the directory's 3644 + * inode item because on log replay we update the field to reflect 3645 + * all existing entries in the directory (see overwrite_item()). 3646 + */ 3647 + return btrfs_delete_one_dir_name(trans, log, path, di); 3490 3648 } 3491 3649 3492 3650 /* ··· 3636 3552 const char *name, int name_len, 3637 3553 struct btrfs_inode *dir, u64 index) 3638 3554 { 3639 - struct btrfs_root *log; 3640 - struct btrfs_dir_item *di; 3641 3555 struct btrfs_path *path; 3642 3556 int ret; 3643 - int err = 0; 3644 - u64 dir_ino = btrfs_ino(dir); 3645 3557 3646 - if (!inode_logged(trans, dir)) 3558 + ret = inode_logged(trans, dir, NULL); 3559 + if (ret == 0) 3647 3560 return; 3561 + else if (ret < 0) { 3562 + btrfs_set_log_full_commit(trans); 3563 + return; 3564 + } 3648 3565 3649 3566 ret = join_running_log_trans(root); 3650 3567 if (ret) ··· 3653 3568 3654 3569 mutex_lock(&dir->log_mutex); 3655 3570 3656 - log = root->log_root; 3657 3571 path = btrfs_alloc_path(); 3658 3572 if (!path) { 3659 - err = -ENOMEM; 3573 + ret = -ENOMEM; 3660 3574 goto out_unlock; 3661 3575 } 3662 3576 3663 - /* 3664 - * We only log dir index items of a directory, so we don't need to look 3665 - * for dir item keys. 
3666 - */ 3667 - di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3668 - index, name, name_len, -1); 3669 - if (IS_ERR(di)) { 3670 - err = PTR_ERR(di); 3671 - goto fail; 3672 - } 3673 - if (di) { 3674 - ret = btrfs_delete_one_dir_name(trans, log, path, di); 3675 - if (ret) { 3676 - err = ret; 3677 - goto fail; 3678 - } 3679 - } 3680 - 3681 - /* 3682 - * We do not need to update the size field of the directory's inode item 3683 - * because on log replay we update the field to reflect all existing 3684 - * entries in the directory (see overwrite_item()). 3685 - */ 3686 - fail: 3577 + ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), 3578 + name, name_len, index); 3687 3579 btrfs_free_path(path); 3688 3580 out_unlock: 3689 3581 mutex_unlock(&dir->log_mutex); 3690 - if (err < 0) 3582 + if (ret < 0) 3691 3583 btrfs_set_log_full_commit(trans); 3692 3584 btrfs_end_log_trans(root); 3693 3585 } ··· 3679 3617 u64 index; 3680 3618 int ret; 3681 3619 3682 - if (!inode_logged(trans, inode)) 3620 + ret = inode_logged(trans, inode, NULL); 3621 + if (ret == 0) 3683 3622 return; 3623 + else if (ret < 0) { 3624 + btrfs_set_log_full_commit(trans); 3625 + return; 3626 + } 3684 3627 3685 3628 ret = join_running_log_trans(root); 3686 3629 if (ret) ··· 3810 3743 struct btrfs_inode *inode, 3811 3744 struct btrfs_path *path, 3812 3745 struct btrfs_path *dst_path, 3813 - struct btrfs_log_ctx *ctx) 3746 + struct btrfs_log_ctx *ctx, 3747 + u64 *last_old_dentry_offset) 3814 3748 { 3815 3749 struct btrfs_root *log = inode->root->log_root; 3816 3750 struct extent_buffer *src = path->nodes[0]; 3817 3751 const int nritems = btrfs_header_nritems(src); 3818 3752 const u64 ino = btrfs_ino(inode); 3819 - const bool inode_logged_before = inode_logged(trans, inode); 3820 3753 bool last_found = false; 3821 3754 int batch_start = 0; 3822 3755 int batch_size = 0; 3823 3756 int i; 3824 3757 3825 3758 for (i = path->slots[0]; i < nritems; i++) { 3759 + struct btrfs_dir_item *di; 
3826 3760 struct btrfs_key key; 3827 3761 int ret; 3828 3762 ··· 3834 3766 break; 3835 3767 } 3836 3768 3769 + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3837 3770 ctx->last_dir_item_offset = key.offset; 3771 + 3772 + /* 3773 + * Skip ranges of items that consist only of dir item keys created 3774 + * in past transactions. However if we find a gap, we must log a 3775 + * dir index range item for that gap, so that index keys in that 3776 + * gap are deleted during log replay. 3777 + */ 3778 + if (btrfs_dir_transid(src, di) < trans->transid) { 3779 + if (key.offset > *last_old_dentry_offset + 1) { 3780 + ret = insert_dir_log_key(trans, log, dst_path, 3781 + ino, *last_old_dentry_offset + 1, 3782 + key.offset - 1); 3783 + /* 3784 + * -EEXIST should never happen because when we 3785 + * log a directory in full mode (LOG_INODE_ALL) 3786 + * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from 3787 + * the log tree. 3788 + */ 3789 + ASSERT(ret != -EEXIST); 3790 + if (ret < 0) 3791 + return ret; 3792 + } 3793 + 3794 + *last_old_dentry_offset = key.offset; 3795 + continue; 3796 + } 3838 3797 /* 3839 3798 * We must make sure that when we log a directory entry, the 3840 3799 * corresponding inode, after log replay, has a matching link ··· 3885 3790 * resulting in -ENOTEMPTY errors. 
3886 3791 */ 3887 3792 if (!ctx->log_new_dentries) { 3888 - struct btrfs_dir_item *di; 3889 3793 struct btrfs_key di_key; 3890 3794 3891 - di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3892 3795 btrfs_dir_item_key_to_cpu(src, di, &di_key); 3893 - if ((btrfs_dir_transid(src, di) == trans->transid || 3894 - btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3895 - di_key.type != BTRFS_ROOT_ITEM_KEY) 3796 + if (di_key.type != BTRFS_ROOT_ITEM_KEY) 3896 3797 ctx->log_new_dentries = true; 3897 3798 } 3898 3799 3899 - if (!inode_logged_before) 3800 + if (!ctx->logged_before) 3900 3801 goto add_to_batch; 3901 3802 3902 3803 /* 3903 3804 * If we were logged before and have logged dir items, we can skip 3904 3805 * checking if any item with a key offset larger than the last one 3905 3806 * we logged is in the log tree, saving time and avoiding adding 3906 - * contention on the log tree. 3807 + * contention on the log tree. We can only rely on the value of 3808 + * last_dir_index_offset when we know for sure that the inode was 3809 + * previously logged in the current transaction. 
3907 3810 */ 3908 3811 if (key.offset > inode->last_dir_index_offset) 3909 3812 goto add_to_batch; ··· 3971 3878 struct btrfs_root *log = root->log_root; 3972 3879 int err = 0; 3973 3880 int ret; 3974 - u64 first_offset = min_offset; 3881 + u64 last_old_dentry_offset = min_offset - 1; 3975 3882 u64 last_offset = (u64)-1; 3976 3883 u64 ino = btrfs_ino(inode); 3977 3884 ··· 4005 3912 */ 4006 3913 if (ret == 0) { 4007 3914 struct btrfs_key tmp; 3915 + 4008 3916 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 4009 3917 path->slots[0]); 4010 3918 if (tmp.type == BTRFS_DIR_INDEX_KEY) 4011 - first_offset = max(min_offset, tmp.offset) + 1; 3919 + last_old_dentry_offset = tmp.offset; 4012 3920 } 4013 3921 goto done; 4014 3922 } ··· 4018 3924 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 4019 3925 if (ret == 0) { 4020 3926 struct btrfs_key tmp; 3927 + 4021 3928 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 4022 - if (tmp.type == BTRFS_DIR_INDEX_KEY) { 4023 - first_offset = tmp.offset; 4024 - ret = overwrite_item(trans, log, dst_path, 4025 - path->nodes[0], path->slots[0], 4026 - &tmp); 4027 - if (ret) { 4028 - err = ret; 4029 - goto done; 4030 - } 4031 - } 3929 + /* 3930 + * The dir index key before the first one we found that needs to 3931 + * be logged might be in a previous leaf, and there might be a 3932 + * gap between these keys, meaning that we had deletions that 3933 + * happened. So the key range item we log (key type 3934 + * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the 3935 + * previous key's offset plus 1, so that those deletes are replayed. 
3936 + */ 3937 + if (tmp.type == BTRFS_DIR_INDEX_KEY) 3938 + last_old_dentry_offset = tmp.offset; 4032 3939 } 4033 3940 btrfs_release_path(path); 4034 3941 ··· 4051 3956 * from our directory 4052 3957 */ 4053 3958 while (1) { 4054 - ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx); 3959 + ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, 3960 + &last_old_dentry_offset); 4055 3961 if (ret != 0) { 4056 3962 if (ret < 0) 4057 3963 err = ret; ··· 4078 3982 goto done; 4079 3983 } 4080 3984 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 4081 - ctx->last_dir_item_offset = min_key.offset; 4082 - ret = overwrite_item(trans, log, dst_path, 4083 - path->nodes[0], path->slots[0], 4084 - &min_key); 4085 - if (ret) 4086 - err = ret; 4087 - else 4088 - last_offset = min_key.offset; 3985 + /* 3986 + * The next leaf was not changed in the current transaction 3987 + * and has at least one dir index key. 3988 + * We check for the next key because there might have been 3989 + * one or more deletions between the last key we logged and 3990 + * that next key. So the key range item we log (key type 3991 + * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's 3992 + * offset minus 1, so that those deletes are replayed. 3993 + */ 3994 + last_offset = min_key.offset - 1; 4089 3995 goto done; 4090 3996 } 4091 3997 if (need_resched()) { ··· 4103 4005 if (err == 0) { 4104 4006 *last_offset_ret = last_offset; 4105 4007 /* 4106 - * insert the log range keys to indicate where the log 4107 - * is valid 4008 + * In case the leaf was changed in the current transaction but 4009 + * all its dir items are from a past transaction, the last item 4010 + * in the leaf is a dir item and there's no gap between that last 4011 + * dir item and the first one on the next leaf (which did not 4012 + * change in the current transaction), then we don't need to log 4013 + * a range, last_old_dentry_offset is == to last_offset. 
4108 4014 */ 4109 - ret = insert_dir_log_key(trans, log, path, ino, first_offset, 4110 - last_offset); 4111 - if (ret) 4112 - err = ret; 4015 + ASSERT(last_old_dentry_offset <= last_offset); 4016 + if (last_old_dentry_offset < last_offset) { 4017 + ret = insert_dir_log_key(trans, log, path, ino, 4018 + last_old_dentry_offset + 1, 4019 + last_offset); 4020 + if (ret) 4021 + err = ret; 4022 + } 4113 4023 } 4114 4024 return err; 4115 4025 } ··· 4144 4038 u64 max_key; 4145 4039 int ret; 4146 4040 4147 - /* 4148 - * If this is the first time we are being logged in the current 4149 - * transaction, or we were logged before but the inode was evicted and 4150 - * reloaded later, in which case its logged_trans is 0, reset the value 4151 - * of the last logged key offset. Note that we don't use the helper 4152 - * function inode_logged() here - that is because the function returns 4153 - * true after an inode eviction, assuming the worst case as it can not 4154 - * know for sure if the inode was logged before. So we can not skip key 4155 - * searches in the case the inode was evicted, because it may not have 4156 - * been logged in this transaction and may have been logged in a past 4157 - * transaction, so we need to reset the last dir index offset to (u64)-1. 
4158 - */ 4159 - if (inode->logged_trans != trans->transid) 4160 - inode->last_dir_index_offset = (u64)-1; 4161 - 4162 - min_key = 0; 4041 + min_key = BTRFS_DIR_START_INDEX; 4163 4042 max_key = 0; 4164 4043 ctx->last_dir_item_offset = inode->last_dir_index_offset; 4165 4044 ··· 4179 4088 struct btrfs_key key; 4180 4089 struct btrfs_key found_key; 4181 4090 int start_slot; 4182 - 4183 - if (!inode_logged(trans, inode)) 4184 - return 0; 4185 4091 4186 4092 key.objectid = btrfs_ino(inode); 4187 4093 key.type = max_key_type; ··· 4399 4311 int start_slot, int nr, int inode_only, 4400 4312 u64 logged_isize) 4401 4313 { 4402 - struct btrfs_fs_info *fs_info = trans->fs_info; 4403 - unsigned long src_offset; 4404 - unsigned long dst_offset; 4405 4314 struct btrfs_root *log = inode->root->log_root; 4406 4315 struct btrfs_file_extent_item *extent; 4407 - struct btrfs_inode_item *inode_item; 4408 4316 struct extent_buffer *src = src_path->nodes[0]; 4409 - int ret; 4317 + int ret = 0; 4410 4318 struct btrfs_key *ins_keys; 4411 4319 u32 *ins_sizes; 4412 4320 struct btrfs_item_batch batch; 4413 4321 char *ins_data; 4414 4322 int i; 4415 - struct list_head ordered_sums; 4416 - int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 4417 - 4418 - INIT_LIST_HEAD(&ordered_sums); 4323 + int dst_index; 4324 + const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); 4325 + const u64 i_size = i_size_read(&inode->vfs_inode); 4419 4326 4420 4327 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4421 4328 nr * sizeof(u32), GFP_NOFS); ··· 4422 4339 batch.keys = ins_keys; 4423 4340 batch.data_sizes = ins_sizes; 4424 4341 batch.total_data_size = 0; 4425 - batch.nr = nr; 4342 + batch.nr = 0; 4426 4343 4344 + dst_index = 0; 4427 4345 for (i = 0; i < nr; i++) { 4428 - ins_sizes[i] = btrfs_item_size(src, i + start_slot); 4429 - batch.total_data_size += ins_sizes[i]; 4430 - btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 4346 + const int src_slot = start_slot + i; 4347 + struct 
btrfs_root *csum_root; 4348 + struct btrfs_ordered_sum *sums; 4349 + struct btrfs_ordered_sum *sums_next; 4350 + LIST_HEAD(ordered_sums); 4351 + u64 disk_bytenr; 4352 + u64 disk_num_bytes; 4353 + u64 extent_offset; 4354 + u64 extent_num_bytes; 4355 + bool is_old_extent; 4356 + 4357 + btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); 4358 + 4359 + if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) 4360 + goto add_to_batch; 4361 + 4362 + extent = btrfs_item_ptr(src, src_slot, 4363 + struct btrfs_file_extent_item); 4364 + 4365 + is_old_extent = (btrfs_file_extent_generation(src, extent) < 4366 + trans->transid); 4367 + 4368 + /* 4369 + * Don't copy extents from past generations. That would make us 4370 + * log a lot more metadata for common cases like doing only a 4371 + * few random writes into a file and then fsync it for the first 4372 + * time or after the full sync flag is set on the inode. We can 4373 + * get leaves full of extent items, most of which are from past 4374 + * generations, so we can skip them - as long as the inode has 4375 + * not been the target of a reflink operation in this transaction, 4376 + * as in that case it might have had file extent items with old 4377 + * generations copied into it. We also must always log prealloc 4378 + * extents that start at or beyond eof, otherwise we would lose 4379 + * them on log replay. 4380 + */ 4381 + if (is_old_extent && 4382 + ins_keys[dst_index].offset < i_size && 4383 + inode->last_reflink_trans < trans->transid) 4384 + continue; 4385 + 4386 + if (skip_csum) 4387 + goto add_to_batch; 4388 + 4389 + /* Only regular extents have checksums. */ 4390 + if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) 4391 + goto add_to_batch; 4392 + 4393 + /* 4394 + * If it's an extent created in a past transaction, then its 4395 + * checksums are already accessible from the committed csum tree, 4396 + * no need to log them. 
4397 + */ 4398 + if (is_old_extent) 4399 + goto add_to_batch; 4400 + 4401 + disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); 4402 + /* If it's an explicit hole, there are no checksums. */ 4403 + if (disk_bytenr == 0) 4404 + goto add_to_batch; 4405 + 4406 + disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); 4407 + 4408 + if (btrfs_file_extent_compression(src, extent)) { 4409 + extent_offset = 0; 4410 + extent_num_bytes = disk_num_bytes; 4411 + } else { 4412 + extent_offset = btrfs_file_extent_offset(src, extent); 4413 + extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); 4414 + } 4415 + 4416 + csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); 4417 + disk_bytenr += extent_offset; 4418 + ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, 4419 + disk_bytenr + extent_num_bytes - 1, 4420 + &ordered_sums, 0); 4421 + if (ret) 4422 + goto out; 4423 + 4424 + list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { 4425 + if (!ret) 4426 + ret = log_csums(trans, inode, log, sums); 4427 + list_del(&sums->list); 4428 + kfree(sums); 4429 + } 4430 + if (ret) 4431 + goto out; 4432 + 4433 + add_to_batch: 4434 + ins_sizes[dst_index] = btrfs_item_size(src, src_slot); 4435 + batch.total_data_size += ins_sizes[dst_index]; 4436 + batch.nr++; 4437 + dst_index++; 4431 4438 } 4439 + 4440 + /* 4441 + * We have a leaf full of old extent items that don't need to be logged, 4442 + * so we don't need to do anything. 
4443 + */ 4444 + if (batch.nr == 0) 4445 + goto out; 4446 + 4432 4447 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 4433 - if (ret) { 4434 - kfree(ins_data); 4435 - return ret; 4436 - } 4448 + if (ret) 4449 + goto out; 4437 4450 4438 - for (i = 0; i < nr; i++, dst_path->slots[0]++) { 4439 - dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 4440 - dst_path->slots[0]); 4451 + dst_index = 0; 4452 + for (i = 0; i < nr; i++) { 4453 + const int src_slot = start_slot + i; 4454 + const int dst_slot = dst_path->slots[0] + dst_index; 4455 + struct btrfs_key key; 4456 + unsigned long src_offset; 4457 + unsigned long dst_offset; 4441 4458 4442 - src_offset = btrfs_item_ptr_offset(src, start_slot + i); 4459 + /* 4460 + * We're done, all the remaining items in the source leaf 4461 + * correspond to old file extent items. 4462 + */ 4463 + if (dst_index >= batch.nr) 4464 + break; 4443 4465 4444 - if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 4445 - inode_item = btrfs_item_ptr(dst_path->nodes[0], 4446 - dst_path->slots[0], 4466 + btrfs_item_key_to_cpu(src, &key, src_slot); 4467 + 4468 + if (key.type != BTRFS_EXTENT_DATA_KEY) 4469 + goto copy_item; 4470 + 4471 + extent = btrfs_item_ptr(src, src_slot, 4472 + struct btrfs_file_extent_item); 4473 + 4474 + /* See the comment in the previous loop, same logic. 
*/ 4475 + if (btrfs_file_extent_generation(src, extent) < trans->transid && 4476 + key.offset < i_size && 4477 + inode->last_reflink_trans < trans->transid) 4478 + continue; 4479 + 4480 + copy_item: 4481 + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); 4482 + src_offset = btrfs_item_ptr_offset(src, src_slot); 4483 + 4484 + if (key.type == BTRFS_INODE_ITEM_KEY) { 4485 + struct btrfs_inode_item *inode_item; 4486 + 4487 + inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, 4447 4488 struct btrfs_inode_item); 4448 4489 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4449 4490 &inode->vfs_inode, ··· 4575 4368 logged_isize); 4576 4369 } else { 4577 4370 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4578 - src_offset, ins_sizes[i]); 4371 + src_offset, ins_sizes[dst_index]); 4579 4372 } 4580 4373 4581 - /* take a reference on file data extents so that truncates 4582 - * or deletes of this inode don't have to relog the inode 4583 - * again 4584 - */ 4585 - if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 4586 - !skip_csum) { 4587 - int found_type; 4588 - extent = btrfs_item_ptr(src, start_slot + i, 4589 - struct btrfs_file_extent_item); 4590 - 4591 - if (btrfs_file_extent_generation(src, extent) < trans->transid) 4592 - continue; 4593 - 4594 - found_type = btrfs_file_extent_type(src, extent); 4595 - if (found_type == BTRFS_FILE_EXTENT_REG) { 4596 - struct btrfs_root *csum_root; 4597 - u64 ds, dl, cs, cl; 4598 - ds = btrfs_file_extent_disk_bytenr(src, 4599 - extent); 4600 - /* ds == 0 is a hole */ 4601 - if (ds == 0) 4602 - continue; 4603 - 4604 - dl = btrfs_file_extent_disk_num_bytes(src, 4605 - extent); 4606 - cs = btrfs_file_extent_offset(src, extent); 4607 - cl = btrfs_file_extent_num_bytes(src, 4608 - extent); 4609 - if (btrfs_file_extent_compression(src, 4610 - extent)) { 4611 - cs = 0; 4612 - cl = dl; 4613 - } 4614 - 4615 - csum_root = btrfs_csum_root(fs_info, ds); 4616 - ret = btrfs_lookup_csums_range(csum_root, 4617 - ds + 
cs, ds + cs + cl - 1, 4618 - &ordered_sums, 0); 4619 - if (ret) 4620 - break; 4621 - } 4622 - } 4374 + dst_index++; 4623 4375 } 4624 4376 4625 4377 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4626 4378 btrfs_release_path(dst_path); 4379 + out: 4627 4380 kfree(ins_data); 4628 - 4629 - /* 4630 - * we have to do this after the loop above to avoid changing the 4631 - * log tree while trying to change the log tree. 4632 - */ 4633 - while (!list_empty(&ordered_sums)) { 4634 - struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4635 - struct btrfs_ordered_sum, 4636 - list); 4637 - if (!ret) 4638 - ret = log_csums(trans, inode, log, sums); 4639 - list_del(&sums->list); 4640 - kfree(sums); 4641 - } 4642 4381 4643 4382 return ret; 4644 4383 } ··· 4721 4568 { 4722 4569 struct btrfs_drop_extents_args drop_args = { 0 }; 4723 4570 struct btrfs_root *log = inode->root->log_root; 4724 - struct btrfs_file_extent_item *fi; 4571 + struct btrfs_file_extent_item fi = { 0 }; 4725 4572 struct extent_buffer *leaf; 4726 - struct btrfs_map_token token; 4727 4573 struct btrfs_key key; 4728 4574 u64 extent_offset = em->start - em->orig_start; 4729 4575 u64 block_len; 4730 4576 int ret; 4577 + 4578 + btrfs_set_stack_file_extent_generation(&fi, trans->transid); 4579 + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4580 + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); 4581 + else 4582 + btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); 4583 + 4584 + block_len = max(em->block_len, em->orig_block_len); 4585 + if (em->compress_type != BTRFS_COMPRESS_NONE) { 4586 + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); 4587 + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4588 + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4589 + btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - 4590 + extent_offset); 4591 + btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4592 + } 4593 + 4594 + 
btrfs_set_stack_file_extent_offset(&fi, extent_offset); 4595 + btrfs_set_stack_file_extent_num_bytes(&fi, em->len); 4596 + btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); 4597 + btrfs_set_stack_file_extent_compression(&fi, em->compress_type); 4731 4598 4732 4599 ret = log_extent_csums(trans, inode, log, em, ctx); 4733 4600 if (ret) ··· 4762 4589 * are small, with a root at level 2 or 3 at most, due to their short 4763 4590 * life span. 4764 4591 */ 4765 - if (inode_logged(trans, inode)) { 4592 + if (ctx->logged_before) { 4766 4593 drop_args.path = path; 4767 4594 drop_args.start = em->start; 4768 4595 drop_args.end = em->start + em->len; 4769 4596 drop_args.replace_extent = true; 4770 - drop_args.extent_item_size = sizeof(*fi); 4597 + drop_args.extent_item_size = sizeof(fi); 4771 4598 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4772 4599 if (ret) 4773 4600 return ret; ··· 4779 4606 key.offset = em->start; 4780 4607 4781 4608 ret = btrfs_insert_empty_item(trans, log, path, &key, 4782 - sizeof(*fi)); 4609 + sizeof(fi)); 4783 4610 if (ret) 4784 4611 return ret; 4785 4612 } 4786 4613 leaf = path->nodes[0]; 4787 - btrfs_init_map_token(&token, leaf); 4788 - fi = btrfs_item_ptr(leaf, path->slots[0], 4789 - struct btrfs_file_extent_item); 4790 - 4791 - btrfs_set_token_file_extent_generation(&token, fi, trans->transid); 4792 - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4793 - btrfs_set_token_file_extent_type(&token, fi, 4794 - BTRFS_FILE_EXTENT_PREALLOC); 4795 - else 4796 - btrfs_set_token_file_extent_type(&token, fi, 4797 - BTRFS_FILE_EXTENT_REG); 4798 - 4799 - block_len = max(em->block_len, em->orig_block_len); 4800 - if (em->compress_type != BTRFS_COMPRESS_NONE) { 4801 - btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4802 - em->block_start); 4803 - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4804 - } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4805 - btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4806 - 
em->block_start - 4807 - extent_offset); 4808 - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4809 - } else { 4810 - btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); 4811 - btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); 4812 - } 4813 - 4814 - btrfs_set_token_file_extent_offset(&token, fi, extent_offset); 4815 - btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); 4816 - btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); 4817 - btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); 4818 - btrfs_set_token_file_extent_encryption(&token, fi, 0); 4819 - btrfs_set_token_file_extent_other_encoding(&token, fi, 0); 4614 + write_extent_buffer(leaf, &fi, 4615 + btrfs_item_ptr_offset(leaf, path->slots[0]), 4616 + sizeof(fi)); 4820 4617 btrfs_mark_buffer_dirty(leaf); 4821 4618 4822 4619 btrfs_release_path(path); ··· 5000 4857 WARN_ON(!list_empty(&extents)); 5001 4858 write_unlock(&tree->lock); 5002 4859 5003 - btrfs_release_path(path); 5004 4860 if (!ret) 5005 4861 ret = btrfs_log_prealloc_extents(trans, inode, path); 5006 4862 if (ret) ··· 5693 5551 } else { 5694 5552 break; 5695 5553 } 5554 + 5555 + /* 5556 + * We may process many leaves full of items for our inode, so 5557 + * avoid monopolizing a cpu for too long by rescheduling while 5558 + * not holding locks on any tree. 
5559 + */ 5560 + cond_resched(); 5696 5561 } 5697 5562 if (ins_nr) { 5698 5563 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ··· 5744 5595 struct btrfs_key min_key; 5745 5596 struct btrfs_key max_key; 5746 5597 struct btrfs_root *log = inode->root->log_root; 5747 - int err = 0; 5748 - int ret = 0; 5598 + int ret; 5749 5599 bool fast_search = false; 5750 5600 u64 ino = btrfs_ino(inode); 5751 5601 struct extent_map_tree *em_tree = &inode->extent_tree; ··· 5753 5605 bool xattrs_logged = false; 5754 5606 bool recursive_logging = false; 5755 5607 bool inode_item_dropped = true; 5608 + const bool orig_logged_before = ctx->logged_before; 5756 5609 5757 5610 path = btrfs_alloc_path(); 5758 5611 if (!path) ··· 5787 5638 * and figure out which index ranges have to be logged. 5788 5639 */ 5789 5640 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5790 - err = btrfs_commit_inode_delayed_items(trans, inode); 5791 - if (err) 5641 + ret = btrfs_commit_inode_delayed_items(trans, inode); 5642 + if (ret) 5792 5643 goto out; 5793 5644 } 5794 5645 ··· 5804 5655 } 5805 5656 5806 5657 /* 5658 + * Before logging the inode item, cache the value returned by 5659 + * inode_logged(), because after that we have the need to figure out if 5660 + * the inode was previously logged in this transaction. 5661 + */ 5662 + ret = inode_logged(trans, inode, path); 5663 + if (ret < 0) 5664 + goto out_unlock; 5665 + ctx->logged_before = (ret == 1); 5666 + ret = 0; 5667 + 5668 + /* 5807 5669 * This is for cases where logging a directory could result in losing a 5808 5670 * a file after replaying the log. 
For example, if we move a file from a 5809 5671 * directory A to a directory B, then fsync directory A, we have no way ··· 5825 5665 inode_only == LOG_INODE_ALL && 5826 5666 inode->last_unlink_trans >= trans->transid) { 5827 5667 btrfs_set_log_full_commit(trans); 5828 - err = 1; 5668 + ret = 1; 5829 5669 goto out_unlock; 5830 5670 } 5831 5671 ··· 5839 5679 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); 5840 5680 if (inode_only == LOG_INODE_EXISTS) 5841 5681 max_key_type = BTRFS_XATTR_ITEM_KEY; 5842 - ret = drop_inode_items(trans, log, path, inode, max_key_type); 5682 + if (ctx->logged_before) 5683 + ret = drop_inode_items(trans, log, path, inode, 5684 + max_key_type); 5843 5685 } else { 5844 - if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) { 5686 + if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { 5845 5687 /* 5846 5688 * Make sure the new inode item we write to the log has 5847 5689 * the same isize as the current one (if it exists). ··· 5857 5695 * (zeroes), as if an expanding truncate happened, 5858 5696 * instead of getting a file of 4Kb only. 
5859 5697 */ 5860 - err = logged_inode_size(log, inode, path, &logged_isize); 5861 - if (err) 5698 + ret = logged_inode_size(log, inode, path, &logged_isize); 5699 + if (ret) 5862 5700 goto out_unlock; 5863 5701 } 5864 5702 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5865 5703 &inode->runtime_flags)) { 5866 5704 if (inode_only == LOG_INODE_EXISTS) { 5867 5705 max_key.type = BTRFS_XATTR_ITEM_KEY; 5868 - ret = drop_inode_items(trans, log, path, inode, 5869 - max_key.type); 5706 + if (ctx->logged_before) 5707 + ret = drop_inode_items(trans, log, path, 5708 + inode, max_key.type); 5870 5709 } else { 5871 5710 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5872 5711 &inode->runtime_flags); 5873 5712 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5874 5713 &inode->runtime_flags); 5875 - if (inode_logged(trans, inode)) 5714 + if (ctx->logged_before) 5876 5715 ret = truncate_inode_items(trans, log, 5877 5716 inode, 0, 0); 5878 5717 } ··· 5883 5720 if (inode_only == LOG_INODE_ALL) 5884 5721 fast_search = true; 5885 5722 max_key.type = BTRFS_XATTR_ITEM_KEY; 5886 - ret = drop_inode_items(trans, log, path, inode, 5887 - max_key.type); 5723 + if (ctx->logged_before) 5724 + ret = drop_inode_items(trans, log, path, inode, 5725 + max_key.type); 5888 5726 } else { 5889 5727 if (inode_only == LOG_INODE_ALL) 5890 5728 fast_search = true; ··· 5894 5730 } 5895 5731 5896 5732 } 5897 - if (ret) { 5898 - err = ret; 5733 + if (ret) 5899 5734 goto out_unlock; 5900 - } 5901 5735 5902 - err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5736 + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5903 5737 path, dst_path, logged_isize, 5904 5738 recursive_logging, inode_only, ctx, 5905 5739 &need_log_inode_item); 5906 - if (err) 5740 + if (ret) 5907 5741 goto out_unlock; 5908 5742 5909 5743 btrfs_release_path(path); 5910 5744 btrfs_release_path(dst_path); 5911 - err = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5912 - if (err) 5745 + ret = btrfs_log_all_xattrs(trans, inode, path, 
dst_path); 5746 + if (ret) 5913 5747 goto out_unlock; 5914 5748 xattrs_logged = true; 5915 5749 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5916 5750 btrfs_release_path(path); 5917 5751 btrfs_release_path(dst_path); 5918 - err = btrfs_log_holes(trans, inode, path); 5919 - if (err) 5752 + ret = btrfs_log_holes(trans, inode, path); 5753 + if (ret) 5920 5754 goto out_unlock; 5921 5755 } 5922 5756 log_extents: 5923 5757 btrfs_release_path(path); 5924 5758 btrfs_release_path(dst_path); 5925 5759 if (need_log_inode_item) { 5926 - err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 5927 - if (err) 5760 + ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 5761 + if (ret) 5928 5762 goto out_unlock; 5929 5763 /* 5930 5764 * If we are doing a fast fsync and the inode was logged before ··· 5933 5771 * BTRFS_INODE_COPY_EVERYTHING set. 5934 5772 */ 5935 5773 if (!xattrs_logged && inode->logged_trans < trans->transid) { 5936 - err = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5937 - if (err) 5774 + ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5775 + if (ret) 5938 5776 goto out_unlock; 5939 5777 btrfs_release_path(path); 5940 5778 } 5941 5779 } 5942 5780 if (fast_search) { 5943 5781 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); 5944 - if (ret) { 5945 - err = ret; 5782 + if (ret) 5946 5783 goto out_unlock; 5947 - } 5948 5784 } else if (inode_only == LOG_INODE_ALL) { 5949 5785 struct extent_map *em, *n; 5950 5786 ··· 5954 5794 5955 5795 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5956 5796 ret = log_directory_changes(trans, inode, path, dst_path, ctx); 5957 - if (ret) { 5958 - err = ret; 5797 + if (ret) 5959 5798 goto out_unlock; 5960 - } 5961 5799 } 5962 5800 5963 5801 spin_lock(&inode->lock); ··· 5994 5836 if (inode_only != LOG_INODE_EXISTS) 5995 5837 inode->last_log_commit = inode->last_sub_trans; 5996 5838 spin_unlock(&inode->lock); 5839 + 5840 + /* 5841 + * 
Reset the last_reflink_trans so that the next fsync does not need to 5842 + * go through the slower path when logging extents and their checksums. 5843 + */ 5844 + if (inode_only == LOG_INODE_ALL) 5845 + inode->last_reflink_trans = 0; 5846 + 5997 5847 out_unlock: 5998 5848 mutex_unlock(&inode->log_mutex); 5999 5849 out: 6000 5850 btrfs_free_path(path); 6001 5851 btrfs_free_path(dst_path); 6002 - return err; 5852 + 5853 + if (recursive_logging) 5854 + ctx->logged_before = orig_logged_before; 5855 + 5856 + return ret; 6003 5857 } 6004 5858 6005 5859 /* ··· 6096 5926 struct btrfs_log_ctx *ctx) 6097 5927 { 6098 5928 struct btrfs_fs_info *fs_info = root->fs_info; 6099 - struct btrfs_root *log = root->log_root; 6100 5929 struct btrfs_path *path; 6101 5930 LIST_HEAD(dir_list); 6102 5931 struct btrfs_dir_list *dir_elem; ··· 6137 5968 min_key.offset = 0; 6138 5969 again: 6139 5970 btrfs_release_path(path); 6140 - ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5971 + ret = btrfs_search_forward(root, &min_key, path, trans->transid); 6141 5972 if (ret < 0) { 6142 5973 goto next_dir_inode; 6143 5974 } else if (ret > 0) { ··· 6145 5976 goto next_dir_inode; 6146 5977 } 6147 5978 6148 - process_leaf: 6149 5979 leaf = path->nodes[0]; 6150 5980 nritems = btrfs_header_nritems(leaf); 6151 5981 for (i = path->slots[0]; i < nritems; i++) { ··· 6162 5994 6163 5995 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 6164 5996 type = btrfs_dir_type(leaf, di); 6165 - if (btrfs_dir_transid(leaf, di) < trans->transid && 6166 - type != BTRFS_FT_DIR) 5997 + if (btrfs_dir_transid(leaf, di) < trans->transid) 6167 5998 continue; 6168 5999 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 6169 6000 if (di_key.type == BTRFS_ROOT_ITEM_KEY) ··· 6199 6032 list_add_tail(&new_dir_elem->list, &dir_list); 6200 6033 } 6201 6034 break; 6202 - } 6203 - if (i == nritems) { 6204 - ret = btrfs_next_leaf(log, path); 6205 - if (ret < 0) { 6206 - goto next_dir_inode; 6207 - } else if (ret > 0) { 6208 
- ret = 0; 6209 - goto next_dir_inode; 6210 - } 6211 - goto process_leaf; 6212 6035 } 6213 6036 if (min_key.offset < (u64)-1) { 6214 6037 min_key.offset++; ··· 6930 6773 mutex_unlock(&dir->log_mutex); 6931 6774 } 6932 6775 6933 - /* 6934 - * Call this after adding a new name for a file and it will properly 6935 - * update the log to reflect the new name. 6776 + /** 6777 + * Update the log after adding a new name for an inode. 6778 + * 6779 + * @trans: Transaction handle. 6780 + * @old_dentry: The dentry associated with the old name and the old 6781 + * parent directory. 6782 + * @old_dir: The inode of the previous parent directory for the case 6783 + * of a rename. For a link operation, it must be NULL. 6784 + * @old_dir_index: The index number associated with the old name, meaningful 6785 + * only for rename operations (when @old_dir is not NULL). 6786 + * Ignored for link operations. 6787 + * @parent: The dentry associated with the directory under which the 6788 + * new name is located. 6789 + * 6790 + * Call this after adding a new name for an inode, as a result of a link or 6791 + * rename operation, and it will properly update the log to reflect the new name. 
6936 6792 */ 6937 6793 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 6938 - struct btrfs_inode *inode, struct btrfs_inode *old_dir, 6939 - struct dentry *parent) 6794 + struct dentry *old_dentry, struct btrfs_inode *old_dir, 6795 + u64 old_dir_index, struct dentry *parent) 6940 6796 { 6797 + struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); 6798 + struct btrfs_root *root = inode->root; 6941 6799 struct btrfs_log_ctx ctx; 6800 + bool log_pinned = false; 6801 + int ret; 6942 6802 6943 6803 /* 6944 6804 * this will force the logging code to walk the dentry chain ··· 6968 6794 * if this inode hasn't been logged and directory we're renaming it 6969 6795 * from hasn't been logged, we don't need to log it 6970 6796 */ 6971 - if (!inode_logged(trans, inode) && 6972 - (!old_dir || !inode_logged(trans, old_dir))) 6973 - return; 6797 + ret = inode_logged(trans, inode, NULL); 6798 + if (ret < 0) { 6799 + goto out; 6800 + } else if (ret == 0) { 6801 + if (!old_dir) 6802 + return; 6803 + /* 6804 + * If the inode was not logged and we are doing a rename (old_dir is not 6805 + * NULL), check if old_dir was logged - if it was not we can return and 6806 + * do nothing. 6807 + */ 6808 + ret = inode_logged(trans, old_dir, NULL); 6809 + if (ret < 0) 6810 + goto out; 6811 + else if (ret == 0) 6812 + return; 6813 + } 6814 + ret = 0; 6974 6815 6975 6816 /* 6976 6817 * If we are doing a rename (old_dir is not NULL) from a directory that 6977 - * was previously logged, make sure the next log attempt on the directory 6978 - * is not skipped and logs the inode again. This is because the log may 6979 - * not currently be authoritative for a range including the old 6980 - * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we 6981 - * do not end up with both the new and old dentries around (in case the 6982 - * inode is a directory we would have a directory with two hard links and 6983 - * 2 inode references for different parents). 
The next log attempt of 6984 - * old_dir will happen at btrfs_log_all_parents(), called through 6985 - * btrfs_log_inode_parent() below, because we have previously set 6986 - * inode->last_unlink_trans to the current transaction ID, either here or 6987 - * at btrfs_record_unlink_dir() in case the inode is a directory. 6818 + * was previously logged, make sure that on log replay we get the old 6819 + * dir entry deleted. This is needed because we will also log the new 6820 + * name of the renamed inode, so we need to make sure that after log 6821 + * replay we don't end up with both the new and old dir entries existing. 6988 6822 */ 6989 - if (old_dir) 6990 - old_dir->logged_trans = 0; 6823 + if (old_dir && old_dir->logged_trans == trans->transid) { 6824 + struct btrfs_root *log = old_dir->root->log_root; 6825 + struct btrfs_path *path; 6826 + 6827 + ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); 6828 + 6829 + /* 6830 + * We have two inodes to update in the log, the old directory and 6831 + * the inode that got renamed, so we must pin the log to prevent 6832 + * anyone from syncing the log until we have updated both inodes 6833 + * in the log. 6834 + */ 6835 + log_pinned = true; 6836 + btrfs_pin_log_trans(root); 6837 + 6838 + path = btrfs_alloc_path(); 6839 + if (!path) { 6840 + ret = -ENOMEM; 6841 + goto out; 6842 + } 6843 + 6844 + /* 6845 + * Other concurrent task might be logging the old directory, 6846 + * as it can be triggered when logging other inode that had or 6847 + * still has a dentry in the old directory. So take the old 6848 + * directory's log_mutex to prevent getting an -EEXIST when 6849 + * logging a key to record the deletion, or having that other 6850 + * task logging the old directory get an -EEXIST if it attempts 6851 + * to log the same key after we just did it. In both cases that 6852 + * would result in falling back to a transaction commit. 
6853 + */ 6854 + mutex_lock(&old_dir->log_mutex); 6855 + ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), 6856 + old_dentry->d_name.name, 6857 + old_dentry->d_name.len, old_dir_index); 6858 + if (ret > 0) { 6859 + /* 6860 + * The dentry does not exist in the log, so record its 6861 + * deletion. 6862 + */ 6863 + btrfs_release_path(path); 6864 + ret = insert_dir_log_key(trans, log, path, 6865 + btrfs_ino(old_dir), 6866 + old_dir_index, old_dir_index); 6867 + } 6868 + mutex_unlock(&old_dir->log_mutex); 6869 + 6870 + btrfs_free_path(path); 6871 + if (ret < 0) 6872 + goto out; 6873 + } 6991 6874 6992 6875 btrfs_init_log_ctx(&ctx, &inode->vfs_inode); 6993 6876 ctx.logging_new_name = true; ··· 7056 6825 * inconsistent state after a rename operation. 7057 6826 */ 7058 6827 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); 6828 + out: 6829 + /* 6830 + * If an error happened mark the log for a full commit because it's not 6831 + * consistent and up to date or we couldn't find out if one of the 6832 + * inodes was logged before in this transaction. Do it before unpinning 6833 + * the log, to avoid any races with someone else trying to commit it. 6834 + */ 6835 + if (ret < 0) 6836 + btrfs_set_log_full_commit(trans); 6837 + if (log_pinned) 6838 + btrfs_end_log_trans(root); 7059 6839 } 7060 6840

+5 -2

fs/btrfs/tree-log.h

··· 17 17 int log_transid; 18 18 bool log_new_dentries; 19 19 bool logging_new_name; 20 + /* Indicate if the inode being logged was logged before. */ 21 + bool logged_before; 20 22 /* Tracks the last logged dir item/index key offset. */ 21 23 u64 last_dir_item_offset; 22 24 struct inode *inode; ··· 34 32 ctx->log_transid = 0; 35 33 ctx->log_new_dentries = false; 36 34 ctx->logging_new_name = false; 35 + ctx->logged_before = false; 37 36 ctx->inode = inode; 38 37 INIT_LIST_HEAD(&ctx->list); 39 38 INIT_LIST_HEAD(&ctx->ordered_extents); ··· 89 86 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 90 87 struct btrfs_inode *dir); 91 88 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 92 - struct btrfs_inode *inode, struct btrfs_inode *old_dir, 93 - struct dentry *parent); 89 + struct dentry *old_dentry, struct btrfs_inode *old_dir, 90 + u64 old_dir_index, struct dentry *parent); 94 91 95 92 #endif

+77 -70

fs/btrfs/volumes.c

··· 534 534 return ret; 535 535 } 536 536 537 - static bool device_path_matched(const char *path, struct btrfs_device *device) 538 - { 539 - int found; 540 - 541 - rcu_read_lock(); 542 - found = strcmp(rcu_str_deref(device->name), path); 543 - rcu_read_unlock(); 544 - 545 - return found == 0; 546 - } 547 - 548 - /* 549 - * Search and remove all stale (devices which are not mounted) devices. 537 + /** 538 + * Search and remove all stale devices (which are not mounted). 550 539 * When both inputs are NULL, it will search and release all stale devices. 551 - * path: Optional. When provided will it release all unmounted devices 552 - * matching this path only. 553 - * skip_dev: Optional. Will skip this device when searching for the stale 540 + * 541 + * @devt: Optional. When provided will it release all unmounted devices 542 + * matching this devt only. 543 + * @skip_device: Optional. Will skip this device when searching for the stale 554 544 * devices. 555 - * Return: 0 for success or if @path is NULL. 556 - * -EBUSY if @path is a mounted device. 557 - * -ENOENT if @path does not match any device in the list. 545 + * 546 + * Return: 0 for success or if @devt is 0. 547 + * -EBUSY if @devt is a mounted device. 548 + * -ENOENT if @devt does not match any device in the list. 
558 549 */ 559 - static int btrfs_free_stale_devices(const char *path, 560 - struct btrfs_device *skip_device) 550 + static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) 561 551 { 562 552 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; 563 553 struct btrfs_device *device, *tmp_device; ··· 555 565 556 566 lockdep_assert_held(&uuid_mutex); 557 567 558 - if (path) 568 + if (devt) 559 569 ret = -ENOENT; 560 570 561 571 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { ··· 565 575 &fs_devices->devices, dev_list) { 566 576 if (skip_device && skip_device == device) 567 577 continue; 568 - if (path && !device->name) 569 - continue; 570 - if (path && !device_path_matched(path, device)) 578 + if (devt && devt != device->devt) 571 579 continue; 572 580 if (fs_devices->opened) { 573 581 /* for an already deleted device return 0 */ 574 - if (path && ret != 0) 582 + if (devt && ret != 0) 575 583 ret = -EBUSY; 576 584 break; 577 585 } ··· 602 614 struct btrfs_device *device, fmode_t flags, 603 615 void *holder) 604 616 { 605 - struct request_queue *q; 606 617 struct block_device *bdev; 607 618 struct btrfs_super_block *disk_super; 608 619 u64 devid; ··· 643 656 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 644 657 } 645 658 646 - q = bdev_get_queue(bdev); 647 - if (!blk_queue_nonrot(q)) 659 + if (!blk_queue_nonrot(bdev_get_queue(bdev))) 648 660 fs_devices->rotating = true; 649 661 650 662 device->bdev = bdev; ··· 767 781 struct rcu_string *name; 768 782 u64 found_transid = btrfs_super_generation(disk_super); 769 783 u64 devid = btrfs_stack_device_id(&disk_super->dev_item); 784 + dev_t path_devt; 785 + int error; 770 786 bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & 771 787 BTRFS_FEATURE_INCOMPAT_METADATA_UUID); 772 788 bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & 773 789 BTRFS_SUPER_FLAG_CHANGING_FSID_V2); 790 + 791 + error = lookup_bdev(path, &path_devt); 792 + if 
(error) 793 + return ERR_PTR(error); 774 794 775 795 if (fsid_change_in_progress) { 776 796 if (!has_metadata_uuid) ··· 860 868 return ERR_PTR(-ENOMEM); 861 869 } 862 870 rcu_assign_pointer(device->name, name); 871 + device->devt = path_devt; 863 872 864 873 list_add_rcu(&device->dev_list, &fs_devices->devices); 865 874 fs_devices->num_devices++; ··· 921 928 /* 922 929 * We are going to replace the device path for a given devid, 923 930 * make sure it's the same device if the device is mounted 931 + * 932 + * NOTE: the device->fs_info may not be reliable here so pass 933 + * in a NULL to message helpers instead. This avoids a possible 934 + * use-after-free when the fs_info and fs_info->sb are already 935 + * torn down. 924 936 */ 925 937 if (device->bdev) { 926 - int error; 927 - dev_t path_dev; 928 - 929 - error = lookup_bdev(path, &path_dev); 930 - if (error) { 938 + if (device->devt != path_devt) { 931 939 mutex_unlock(&fs_devices->device_list_mutex); 932 - return ERR_PTR(error); 933 - } 934 - 935 - if (device->bdev->bd_dev != path_dev) { 936 - mutex_unlock(&fs_devices->device_list_mutex); 937 - /* 938 - * device->fs_info may not be reliable here, so 939 - * pass in a NULL instead. This avoids a 940 - * possible use-after-free when the fs_info and 941 - * fs_info->sb are already torn down. 
942 - */ 943 940 btrfs_warn_in_rcu(NULL, 944 941 "duplicate device %s devid %llu generation %llu scanned by %s (%d)", 945 942 path, devid, found_transid, ··· 937 954 task_pid_nr(current)); 938 955 return ERR_PTR(-EEXIST); 939 956 } 940 - btrfs_info_in_rcu(device->fs_info, 957 + btrfs_info_in_rcu(NULL, 941 958 "devid %llu device path %s changed to %s scanned by %s (%d)", 942 959 devid, rcu_str_deref(device->name), 943 960 path, current->comm, ··· 955 972 fs_devices->missing_devices--; 956 973 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 957 974 } 975 + device->devt = path_devt; 958 976 } 959 977 960 978 /* ··· 1315 1331 return disk_super; 1316 1332 } 1317 1333 1318 - int btrfs_forget_devices(const char *path) 1334 + int btrfs_forget_devices(dev_t devt) 1319 1335 { 1320 1336 int ret; 1321 1337 1322 1338 mutex_lock(&uuid_mutex); 1323 - ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); 1339 + ret = btrfs_free_stale_devices(devt, NULL); 1324 1340 mutex_unlock(&uuid_mutex); 1325 1341 1326 1342 return ret; ··· 1369 1385 } 1370 1386 1371 1387 device = device_list_add(path, disk_super, &new_device_added); 1372 - if (!IS_ERR(device)) { 1373 - if (new_device_added) 1374 - btrfs_free_stale_devices(path, device); 1375 - } 1388 + if (!IS_ERR(device) && new_device_added) 1389 + btrfs_free_stale_devices(device->devt, device); 1376 1390 1377 1391 btrfs_release_disk_super(disk_super); 1378 1392 ··· 2084 2102 u64 num_devices; 2085 2103 int ret = 0; 2086 2104 2105 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 2106 + btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); 2107 + return -EINVAL; 2108 + } 2109 + 2087 2110 /* 2088 2111 * The device list in fs_devices is accessed without locks (neither 2089 2112 * uuid_mutex nor device_list_mutex) as it won't change on a mounted ··· 2593 2606 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2594 2607 { 2595 2608 struct btrfs_root *root = fs_info->dev_root; 
2596 - struct request_queue *q; 2597 2609 struct btrfs_trans_handle *trans; 2598 2610 struct btrfs_device *device; 2599 2611 struct block_device *bdev; ··· 2654 2668 2655 2669 device->fs_info = fs_info; 2656 2670 device->bdev = bdev; 2671 + ret = lookup_bdev(device_path, &device->devt); 2672 + if (ret) 2673 + goto error_free_device; 2657 2674 2658 2675 ret = btrfs_get_dev_zone_info(device, false); 2659 2676 if (ret) ··· 2668 2679 goto error_free_zone; 2669 2680 } 2670 2681 2671 - q = bdev_get_queue(bdev); 2672 2682 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2673 2683 device->generation = trans->transid; 2674 2684 device->io_width = fs_info->sectorsize; ··· 2715 2727 2716 2728 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2717 2729 2718 - if (!blk_queue_nonrot(q)) 2730 + if (!blk_queue_nonrot(bdev_get_queue(bdev))) 2719 2731 fs_devices->rotating = true; 2720 2732 2721 2733 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); ··· 2802 2814 * We can ignore the return value as it typically returns -EINVAL and 2803 2815 * only succeeds if the device was an alien. 2804 2816 */ 2805 - btrfs_forget_devices(device_path); 2817 + btrfs_forget_devices(device->devt); 2806 2818 2807 2819 /* Update ctime/mtime for blkid or udev */ 2808 2820 update_dev_time(device_path); ··· 3238 3250 struct btrfs_block_group *block_group; 3239 3251 u64 length; 3240 3252 int ret; 3253 + 3254 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 3255 + btrfs_err(fs_info, 3256 + "relocate: not supported on extent tree v2 yet"); 3257 + return -EINVAL; 3258 + } 3241 3259 3242 3260 /* 3243 3261 * Prevent races with automatic removal of unused block groups. 
··· 7054 7060 } 7055 7061 #endif 7056 7062 7063 + static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, 7064 + u64 devid, u8 *uuid) 7065 + { 7066 + struct btrfs_device *dev; 7067 + 7068 + if (!btrfs_test_opt(fs_info, DEGRADED)) { 7069 + btrfs_report_missing_device(fs_info, devid, uuid, true); 7070 + return ERR_PTR(-ENOENT); 7071 + } 7072 + 7073 + dev = add_missing_dev(fs_info->fs_devices, devid, uuid); 7074 + if (IS_ERR(dev)) { 7075 + btrfs_err(fs_info, "failed to init missing device %llu: %ld", 7076 + devid, PTR_ERR(dev)); 7077 + return dev; 7078 + } 7079 + btrfs_report_missing_device(fs_info, devid, uuid, false); 7080 + 7081 + return dev; 7082 + } 7083 + 7057 7084 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 7058 7085 struct btrfs_chunk *chunk) 7059 7086 { ··· 7162 7147 BTRFS_UUID_SIZE); 7163 7148 args.uuid = uuid; 7164 7149 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); 7165 - if (!map->stripes[i].dev && 7166 - !btrfs_test_opt(fs_info, DEGRADED)) { 7167 - free_extent_map(em); 7168 - btrfs_report_missing_device(fs_info, devid, uuid, true); 7169 - return -ENOENT; 7170 - } 7171 7150 if (!map->stripes[i].dev) { 7172 - map->stripes[i].dev = 7173 - add_missing_dev(fs_info->fs_devices, devid, 7174 - uuid); 7151 + map->stripes[i].dev = handle_missing_device(fs_info, 7152 + devid, uuid); 7175 7153 if (IS_ERR(map->stripes[i].dev)) { 7176 7154 free_extent_map(em); 7177 - btrfs_err(fs_info, 7178 - "failed to init missing dev %llu: %ld", 7179 - devid, PTR_ERR(map->stripes[i].dev)); 7180 7155 return PTR_ERR(map->stripes[i].dev); 7181 7156 } 7182 - btrfs_report_missing_device(fs_info, devid, uuid, false); 7183 7157 } 7158 + 7184 7159 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 7185 7160 &(map->stripes[i].dev->dev_state)); 7186 - 7187 7161 } 7188 7162 7189 7163 write_lock(&map_tree->lock); ··· 8303 8299 target = cache->start; 8304 8300 btrfs_put_block_group(cache); 8305 8301 8302 + 
sb_start_write(fs_info->sb); 8306 8303 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { 8307 8304 btrfs_info(fs_info, 8308 8305 "zoned: skip relocating block group %llu to repair: EBUSY", 8309 8306 target); 8307 + sb_end_write(fs_info->sb); 8310 8308 return -EBUSY; 8311 8309 } 8312 8310 ··· 8336 8330 btrfs_put_block_group(cache); 8337 8331 mutex_unlock(&fs_info->reclaim_bgs_lock); 8338 8332 btrfs_exclop_finish(fs_info); 8333 + sb_end_write(fs_info->sb); 8339 8334 8340 8335 return ret; 8341 8336 }

+6 -1

fs/btrfs/volumes.h

··· 72 72 /* the mode sent to blkdev_get */ 73 73 fmode_t mode; 74 74 75 + /* 76 + * Device's major-minor number. Must be set even if the device is not 77 + * opened (bdev == NULL), unless the device is missing. 78 + */ 79 + dev_t devt; 75 80 unsigned long dev_state; 76 81 blk_status_t last_flush_error; 77 82 ··· 510 505 fmode_t flags, void *holder); 511 506 struct btrfs_device *btrfs_scan_one_device(const char *path, 512 507 fmode_t flags, void *holder); 513 - int btrfs_forget_devices(const char *path); 508 + int btrfs_forget_devices(dev_t devt); 514 509 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 515 510 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); 516 511 void btrfs_assign_next_active_device(struct btrfs_device *device,

+111 -62

fs/btrfs/zoned.c

··· 652 652 if (model == BLK_ZONED_HM || 653 653 (model == BLK_ZONED_HA && incompat_zoned) || 654 654 (model == BLK_ZONED_NONE && incompat_zoned)) { 655 - struct btrfs_zoned_device_info *zone_info = 656 - device->zone_info; 655 + struct btrfs_zoned_device_info *zone_info; 657 656 658 657 zone_info = device->zone_info; 659 658 zoned_devices++; ··· 1214 1215 struct btrfs_device *device; 1215 1216 u64 logical = cache->start; 1216 1217 u64 length = cache->length; 1217 - u64 physical = 0; 1218 1218 int ret; 1219 1219 int i; 1220 1220 unsigned int nofs_flag; 1221 1221 u64 *alloc_offsets = NULL; 1222 1222 u64 *caps = NULL; 1223 + u64 *physical = NULL; 1223 1224 unsigned long *active = NULL; 1224 1225 u64 last_alloc = 0; 1225 1226 u32 num_sequential = 0, num_conventional = 0; ··· 1263 1264 goto out; 1264 1265 } 1265 1266 1267 + physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); 1268 + if (!physical) { 1269 + ret = -ENOMEM; 1270 + goto out; 1271 + } 1272 + 1266 1273 active = bitmap_zalloc(map->num_stripes, GFP_NOFS); 1267 1274 if (!active) { 1268 1275 ret = -ENOMEM; ··· 1282 1277 int dev_replace_is_ongoing = 0; 1283 1278 1284 1279 device = map->stripes[i].dev; 1285 - physical = map->stripes[i].physical; 1280 + physical[i] = map->stripes[i].physical; 1286 1281 1287 1282 if (device->bdev == NULL) { 1288 1283 alloc_offsets[i] = WP_MISSING_DEV; 1289 1284 continue; 1290 1285 } 1291 1286 1292 - is_sequential = btrfs_dev_is_sequential(device, physical); 1287 + is_sequential = btrfs_dev_is_sequential(device, physical[i]); 1293 1288 if (is_sequential) 1294 1289 num_sequential++; 1295 1290 else ··· 1304 1299 * This zone will be used for allocation, so mark this zone 1305 1300 * non-empty. 
1306 1301 */ 1307 - btrfs_dev_clear_zone_empty(device, physical); 1302 + btrfs_dev_clear_zone_empty(device, physical[i]); 1308 1303 1309 1304 down_read(&dev_replace->rwsem); 1310 1305 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 1311 1306 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) 1312 - btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical); 1307 + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); 1313 1308 up_read(&dev_replace->rwsem); 1314 1309 1315 1310 /* 1316 1311 * The group is mapped to a sequential zone. Get the zone write 1317 1312 * pointer to determine the allocation offset within the zone. 1318 1313 */ 1319 - WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size)); 1314 + WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); 1320 1315 nofs_flag = memalloc_nofs_save(); 1321 - ret = btrfs_get_dev_zone(device, physical, &zone); 1316 + ret = btrfs_get_dev_zone(device, physical[i], &zone); 1322 1317 memalloc_nofs_restore(nofs_flag); 1323 1318 if (ret == -EIO || ret == -EOPNOTSUPP) { 1324 1319 ret = 0; ··· 1344 1339 case BLK_ZONE_COND_READONLY: 1345 1340 btrfs_err(fs_info, 1346 1341 "zoned: offline/readonly zone %llu on device %s (devid %llu)", 1347 - physical >> device->zone_info->zone_size_shift, 1342 + physical[i] >> device->zone_info->zone_size_shift, 1348 1343 rcu_str_deref(device->name), device->devid); 1349 1344 alloc_offsets[i] = WP_MISSING_DEV; 1350 1345 break; ··· 1409 1404 if (alloc_offsets[0] == WP_MISSING_DEV) { 1410 1405 btrfs_err(fs_info, 1411 1406 "zoned: cannot recover write pointer for zone %llu", 1412 - physical); 1407 + physical[0]); 1413 1408 ret = -EIO; 1414 1409 goto out; 1415 1410 } ··· 1418 1413 cache->zone_is_active = test_bit(0, active); 1419 1414 break; 1420 1415 case BTRFS_BLOCK_GROUP_DUP: 1416 + if (map->type & BTRFS_BLOCK_GROUP_DATA) { 1417 + btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); 1418 + ret = -EINVAL; 1419 + goto out; 1420 + } 1421 + if 
(alloc_offsets[0] == WP_MISSING_DEV) { 1422 + btrfs_err(fs_info, 1423 + "zoned: cannot recover write pointer for zone %llu", 1424 + physical[0]); 1425 + ret = -EIO; 1426 + goto out; 1427 + } 1428 + if (alloc_offsets[1] == WP_MISSING_DEV) { 1429 + btrfs_err(fs_info, 1430 + "zoned: cannot recover write pointer for zone %llu", 1431 + physical[1]); 1432 + ret = -EIO; 1433 + goto out; 1434 + } 1435 + if (alloc_offsets[0] != alloc_offsets[1]) { 1436 + btrfs_err(fs_info, 1437 + "zoned: write pointer offset mismatch of zones in DUP profile"); 1438 + ret = -EIO; 1439 + goto out; 1440 + } 1441 + if (test_bit(0, active) != test_bit(1, active)) { 1442 + if (!btrfs_zone_activate(cache)) { 1443 + ret = -EIO; 1444 + goto out; 1445 + } 1446 + } else { 1447 + cache->zone_is_active = test_bit(0, active); 1448 + } 1449 + cache->alloc_offset = alloc_offsets[0]; 1450 + cache->zone_capacity = min(caps[0], caps[1]); 1451 + break; 1421 1452 case BTRFS_BLOCK_GROUP_RAID1: 1422 1453 case BTRFS_BLOCK_GROUP_RAID0: 1423 1454 case BTRFS_BLOCK_GROUP_RAID10: ··· 1506 1465 cache->physical_map = NULL; 1507 1466 } 1508 1467 bitmap_free(active); 1468 + kfree(physical); 1509 1469 kfree(caps); 1510 1470 kfree(alloc_offsets); 1511 1471 free_extent_map(em); ··· 1823 1781 struct btrfs_device *device; 1824 1782 u64 physical; 1825 1783 bool ret; 1784 + int i; 1826 1785 1827 1786 if (!btrfs_is_zoned(block_group->fs_info)) 1828 1787 return true; 1829 1788 1830 1789 map = block_group->physical_map; 1831 - /* Currently support SINGLE profile only */ 1832 - ASSERT(map->num_stripes == 1); 1833 - device = map->stripes[0].dev; 1834 - physical = map->stripes[0].physical; 1835 - 1836 - if (device->zone_info->max_active_zones == 0) 1837 - return true; 1838 1790 1839 1791 spin_lock(&block_group->lock); 1840 - 1841 1792 if (block_group->zone_is_active) { 1842 1793 ret = true; 1843 1794 goto out_unlock; 1844 1795 } 1845 1796 1846 - /* No space left */ 1847 - if (block_group->alloc_offset == block_group->zone_capacity) { 
1848 - ret = false; 1849 - goto out_unlock; 1797 + for (i = 0; i < map->num_stripes; i++) { 1798 + device = map->stripes[i].dev; 1799 + physical = map->stripes[i].physical; 1800 + 1801 + if (device->zone_info->max_active_zones == 0) 1802 + continue; 1803 + 1804 + /* No space left */ 1805 + if (block_group->alloc_offset == block_group->zone_capacity) { 1806 + ret = false; 1807 + goto out_unlock; 1808 + } 1809 + 1810 + if (!btrfs_dev_set_active_zone(device, physical)) { 1811 + /* Cannot activate the zone */ 1812 + ret = false; 1813 + goto out_unlock; 1814 + } 1815 + 1816 + /* Successfully activated all the zones */ 1817 + if (i == map->num_stripes - 1) 1818 + block_group->zone_is_active = 1; 1819 + 1820 + 1850 1821 } 1851 - 1852 - if (!btrfs_dev_set_active_zone(device, physical)) { 1853 - /* Cannot activate the zone */ 1854 - ret = false; 1855 - goto out_unlock; 1856 - } 1857 - 1858 - /* Successfully activated all the zones */ 1859 - block_group->zone_is_active = 1; 1860 - 1861 1822 spin_unlock(&block_group->lock); 1862 1823 1863 - /* For the active block group list */ 1864 - btrfs_get_block_group(block_group); 1824 + if (block_group->zone_is_active) { 1825 + /* For the active block group list */ 1826 + btrfs_get_block_group(block_group); 1865 1827 1866 - spin_lock(&fs_info->zone_active_bgs_lock); 1867 - ASSERT(list_empty(&block_group->active_bg_list)); 1868 - list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); 1869 - spin_unlock(&fs_info->zone_active_bgs_lock); 1828 + spin_lock(&fs_info->zone_active_bgs_lock); 1829 + list_add_tail(&block_group->active_bg_list, 1830 + &fs_info->zone_active_bgs); 1831 + spin_unlock(&fs_info->zone_active_bgs_lock); 1832 + } 1870 1833 1871 1834 return true; 1872 1835 ··· 1887 1840 struct btrfs_device *device; 1888 1841 u64 physical; 1889 1842 int ret = 0; 1843 + int i; 1890 1844 1891 1845 if (!btrfs_is_zoned(fs_info)) 1892 1846 return 0; 1893 1847 1894 1848 map = block_group->physical_map; 1895 - /* Currently support 
SINGLE profile only */ 1896 - ASSERT(map->num_stripes == 1); 1897 - 1898 - device = map->stripes[0].dev; 1899 - physical = map->stripes[0].physical; 1900 - 1901 - if (device->zone_info->max_active_zones == 0) 1902 - return 0; 1903 1849 1904 1850 spin_lock(&block_group->lock); 1905 1851 if (!block_group->zone_is_active) { ··· 1944 1904 btrfs_clear_data_reloc_bg(block_group); 1945 1905 spin_unlock(&block_group->lock); 1946 1906 1947 - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, 1948 - physical >> SECTOR_SHIFT, 1949 - device->zone_info->zone_size >> SECTOR_SHIFT, 1950 - GFP_NOFS); 1907 + for (i = 0; i < map->num_stripes; i++) { 1908 + device = map->stripes[i].dev; 1909 + physical = map->stripes[i].physical; 1910 + 1911 + if (device->zone_info->max_active_zones == 0) 1912 + continue; 1913 + 1914 + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, 1915 + physical >> SECTOR_SHIFT, 1916 + device->zone_info->zone_size >> SECTOR_SHIFT, 1917 + GFP_NOFS); 1918 + 1919 + if (ret) 1920 + return ret; 1921 + 1922 + btrfs_dev_clear_active_zone(device, physical); 1923 + } 1951 1924 btrfs_dec_block_group_ro(block_group); 1952 1925 1953 - if (!ret) { 1954 - btrfs_dev_clear_active_zone(device, physical); 1926 + spin_lock(&fs_info->zone_active_bgs_lock); 1927 + ASSERT(!list_empty(&block_group->active_bg_list)); 1928 + list_del_init(&block_group->active_bg_list); 1929 + spin_unlock(&fs_info->zone_active_bgs_lock); 1955 1930 1956 - spin_lock(&fs_info->zone_active_bgs_lock); 1957 - ASSERT(!list_empty(&block_group->active_bg_list)); 1958 - list_del_init(&block_group->active_bg_list); 1959 - spin_unlock(&fs_info->zone_active_bgs_lock); 1931 + /* For active_bg_list */ 1932 + btrfs_put_block_group(block_group); 1960 1933 1961 - /* For active_bg_list */ 1962 - btrfs_put_block_group(block_group); 1963 - } 1964 - 1965 - return ret; 1934 + return 0; 1966 1935 } 1967 1936 1968 1937 bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)

-5

fs/internal.h

··· 158 158 extern void shrink_dentry_list(struct list_head *); 159 159 160 160 /* 161 - * read_write.c 162 - */ 163 - extern int rw_verify_area(int, struct file *, const loff_t *, size_t); 164 - 165 - /* 166 161 * pipe.c 167 162 */ 168 163 extern const struct file_operations pipefifo_fops;

-4

fs/ioctl.c

··· 236 236 237 237 if (!src_file.file) 238 238 return -EBADF; 239 - ret = -EXDEV; 240 - if (src_file.file->f_path.mnt != dst_file->f_path.mnt) 241 - goto fdput; 242 239 cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff, 243 240 olen, 0); 244 241 if (cloned < 0) ··· 244 247 ret = -EINVAL; 245 248 else 246 249 ret = 0; 247 - fdput: 248 250 fdput(src_file); 249 251 return ret; 250 252 }

+21 -13

fs/read_write.c

··· 385 385 return security_file_permission(file, 386 386 read_write == READ ? MAY_READ : MAY_WRITE); 387 387 } 388 + EXPORT_SYMBOL(rw_verify_area); 388 389 389 390 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) 390 391 { ··· 1618 1617 return 0; 1619 1618 } 1620 1619 1621 - /* 1622 - * Performs necessary checks before doing a write 1623 - * 1624 - * Can adjust writing position or amount of bytes to write. 1625 - * Returns appropriate error code that caller should return or 1626 - * zero in case that write should be allowed. 1627 - */ 1628 - ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) 1620 + /* Like generic_write_checks(), but takes size of write instead of iter. */ 1621 + int generic_write_checks_count(struct kiocb *iocb, loff_t *count) 1629 1622 { 1630 1623 struct file *file = iocb->ki_filp; 1631 1624 struct inode *inode = file->f_mapping->host; 1632 - loff_t count; 1633 - int ret; 1634 1625 1635 1626 if (IS_SWAPFILE(inode)) 1636 1627 return -ETXTBSY; 1637 1628 1638 - if (!iov_iter_count(from)) 1629 + if (!*count) 1639 1630 return 0; 1640 1631 1641 1632 /* FIXME: this is for backwards compatibility with 2.4 */ ··· 1637 1644 if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) 1638 1645 return -EINVAL; 1639 1646 1640 - count = iov_iter_count(from); 1641 - ret = generic_write_check_limits(file, iocb->ki_pos, &count); 1647 + return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); 1648 + } 1649 + EXPORT_SYMBOL(generic_write_checks_count); 1650 + 1651 + /* 1652 + * Performs necessary checks before doing a write 1653 + * 1654 + * Can adjust writing position or amount of bytes to write. 1655 + * Returns appropriate error code that caller should return or 1656 + * zero in case that write should be allowed. 
1657 + */ 1658 + ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) 1659 + { 1660 + loff_t count = iov_iter_count(from); 1661 + int ret; 1662 + 1663 + ret = generic_write_checks_count(iocb, &count); 1642 1664 if (ret) 1643 1665 return ret; 1644 1666

+1 -6

fs/remap_range.c

··· 362 362 363 363 WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); 364 364 365 - /* 366 - * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on 367 - * the same mount. Practically, they only need to be on the same file 368 - * system. 369 - */ 370 365 if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) 371 366 return -EXDEV; 372 367 ··· 453 458 goto out_drop_write; 454 459 455 460 ret = -EXDEV; 456 - if (src_file->f_path.mnt != dst_file->f_path.mnt) 461 + if (file_inode(src_file)->i_sb != file_inode(dst_file)->i_sb) 457 462 goto out_drop_write; 458 463 459 464 ret = -EISDIR;

+2

include/linux/fs.h

··· 3130 3130 extern int generic_file_mmap(struct file *, struct vm_area_struct *); 3131 3131 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); 3132 3132 extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); 3133 + int generic_write_checks_count(struct kiocb *iocb, loff_t *count); 3133 3134 extern int generic_write_check_limits(struct file *file, loff_t pos, 3134 3135 loff_t *count); 3135 3136 extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); ··· 3174 3173 int whence, loff_t size); 3175 3174 extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); 3176 3175 extern loff_t no_seek_end_llseek(struct file *, loff_t, int); 3176 + int rw_verify_area(int, struct file *, const loff_t *, size_t); 3177 3177 extern int generic_file_open(struct inode * inode, struct file * filp); 3178 3178 extern int nonseekable_open(struct inode * inode, struct file * filp); 3179 3179 extern int stream_open(struct inode * inode, struct file * filp);

+1

include/trace/events/btrfs.h

··· 53 53 { BTRFS_TREE_RELOC_OBJECTID, "TREE_RELOC" }, \ 54 54 { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, \ 55 55 { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, \ 56 + { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },\ 56 57 { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }) 57 58 58 59 #define show_root_type(obj) \

+133

include/uapi/linux/btrfs.h

··· 309 309 #define BTRFS_FEATURE_INCOMPAT_METADATA_UUID (1ULL << 10) 310 310 #define BTRFS_FEATURE_INCOMPAT_RAID1C34 (1ULL << 11) 311 311 #define BTRFS_FEATURE_INCOMPAT_ZONED (1ULL << 12) 312 + #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13) 312 313 313 314 struct btrfs_ioctl_feature_flags { 314 315 __u64 compat_flags; ··· 869 868 __u8 align[7]; 870 869 }; 871 870 871 + /* 872 + * Data and metadata for an encoded read or write. 873 + * 874 + * Encoded I/O bypasses any encoding automatically done by the filesystem (e.g., 875 + * compression). This can be used to read the compressed contents of a file or 876 + * write pre-compressed data directly to a file. 877 + * 878 + * BTRFS_IOC_ENCODED_READ and BTRFS_IOC_ENCODED_WRITE are essentially 879 + * preadv/pwritev with additional metadata about how the data is encoded and the 880 + * size of the unencoded data. 881 + * 882 + * BTRFS_IOC_ENCODED_READ fills the given iovecs with the encoded data, fills 883 + * the metadata fields, and returns the size of the encoded data. It reads one 884 + * extent per call. It can also read data which is not encoded. 885 + * 886 + * BTRFS_IOC_ENCODED_WRITE uses the metadata fields, writes the encoded data 887 + * from the iovecs, and returns the size of the encoded data. Note that the 888 + * encoded data is not validated when it is written; if it is not valid (e.g., 889 + * it cannot be decompressed), then a subsequent read may return an error. 890 + * 891 + * Since the filesystem page cache contains decoded data, encoded I/O bypasses 892 + * the page cache. Encoded I/O requires CAP_SYS_ADMIN. 893 + */ 894 + struct btrfs_ioctl_encoded_io_args { 895 + /* Input parameters for both reads and writes. */ 896 + 897 + /* 898 + * iovecs containing encoded data. 899 + * 900 + * For reads, if the size of the encoded data is larger than the sum of 901 + * iov[n].iov_len for 0 <= n < iovcnt, then the ioctl fails with 902 + * ENOBUFS. 
903 + * 904 + * For writes, the size of the encoded data is the sum of iov[n].iov_len 905 + * for 0 <= n < iovcnt. This must be less than 128 KiB (this limit may 906 + * increase in the future). This must also be less than or equal to 907 + * unencoded_len. 908 + */ 909 + const struct iovec __user *iov; 910 + /* Number of iovecs. */ 911 + unsigned long iovcnt; 912 + /* 913 + * Offset in file. 914 + * 915 + * For writes, must be aligned to the sector size of the filesystem. 916 + */ 917 + __s64 offset; 918 + /* Currently must be zero. */ 919 + __u64 flags; 920 + 921 + /* 922 + * For reads, the following members are output parameters that will 923 + * contain the returned metadata for the encoded data. 924 + * For writes, the following members must be set to the metadata for the 925 + * encoded data. 926 + */ 927 + 928 + /* 929 + * Length of the data in the file. 930 + * 931 + * Must be less than or equal to unencoded_len - unencoded_offset. For 932 + * writes, must be aligned to the sector size of the filesystem unless 933 + * the data ends at or beyond the current end of the file. 934 + */ 935 + __u64 len; 936 + /* 937 + * Length of the unencoded (i.e., decrypted and decompressed) data. 938 + * 939 + * For writes, must be no more than 128 KiB (this limit may increase in 940 + * the future). If the unencoded data is actually longer than 941 + * unencoded_len, then it is truncated; if it is shorter, then it is 942 + * extended with zeroes. 943 + */ 944 + __u64 unencoded_len; 945 + /* 946 + * Offset from the first byte of the unencoded data to the first byte of 947 + * logical data in the file. 948 + * 949 + * Must be less than unencoded_len. 950 + */ 951 + __u64 unencoded_offset; 952 + /* 953 + * BTRFS_ENCODED_IO_COMPRESSION_* type. 954 + * 955 + * For writes, must not be BTRFS_ENCODED_IO_COMPRESSION_NONE. 956 + */ 957 + __u32 compression; 958 + /* Currently always BTRFS_ENCODED_IO_ENCRYPTION_NONE. 
*/ 959 + __u32 encryption; 960 + /* 961 + * Reserved for future expansion. 962 + * 963 + * For reads, always returned as zero. Users should check for non-zero 964 + * bytes. If there are any, then the kernel has a newer version of this 965 + * structure with additional information that the user definition is 966 + * missing. 967 + * 968 + * For writes, must be zeroed. 969 + */ 970 + __u8 reserved[64]; 971 + }; 972 + 973 + /* Data is not compressed. */ 974 + #define BTRFS_ENCODED_IO_COMPRESSION_NONE 0 975 + /* Data is compressed as a single zlib stream. */ 976 + #define BTRFS_ENCODED_IO_COMPRESSION_ZLIB 1 977 + /* 978 + * Data is compressed as a single zstd frame with the windowLog compression 979 + * parameter set to no more than 17. 980 + */ 981 + #define BTRFS_ENCODED_IO_COMPRESSION_ZSTD 2 982 + /* 983 + * Data is compressed sector by sector (using the sector size indicated by the 984 + * name of the constant) with LZO1X and wrapped in the format documented in 985 + * fs/btrfs/lzo.c. For writes, the compression sector size must match the 986 + * filesystem sector size. 987 + */ 988 + #define BTRFS_ENCODED_IO_COMPRESSION_LZO_4K 3 989 + #define BTRFS_ENCODED_IO_COMPRESSION_LZO_8K 4 990 + #define BTRFS_ENCODED_IO_COMPRESSION_LZO_16K 5 991 + #define BTRFS_ENCODED_IO_COMPRESSION_LZO_32K 6 992 + #define BTRFS_ENCODED_IO_COMPRESSION_LZO_64K 7 993 + #define BTRFS_ENCODED_IO_COMPRESSION_TYPES 8 994 + 995 + /* Data is not encrypted. 
*/ 996 + #define BTRFS_ENCODED_IO_ENCRYPTION_NONE 0 997 + #define BTRFS_ENCODED_IO_ENCRYPTION_TYPES 1 998 + 872 999 /* Error codes as returned by the kernel */ 873 1000 enum btrfs_err_code { 874 1001 BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, ··· 1125 996 struct btrfs_ioctl_ino_lookup_user_args) 1126 997 #define BTRFS_IOC_SNAP_DESTROY_V2 _IOW(BTRFS_IOCTL_MAGIC, 63, \ 1127 998 struct btrfs_ioctl_vol_args_v2) 999 + #define BTRFS_IOC_ENCODED_READ _IOR(BTRFS_IOCTL_MAGIC, 64, \ 1000 + struct btrfs_ioctl_encoded_io_args) 1001 + #define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \ 1002 + struct btrfs_ioctl_encoded_io_args) 1128 1003 1129 1004 #endif /* _UAPI_LINUX_BTRFS_H */

+3

include/uapi/linux/btrfs_tree.h

··· 53 53 /* tracks free space in block groups. */ 54 54 #define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL 55 55 56 + /* Holds the block group items for extent tree v2. */ 57 + #define BTRFS_BLOCK_GROUP_TREE_OBJECTID 11ULL 58 + 56 59 /* device stats in the device tree */ 57 60 #define BTRFS_DEV_STATS_OBJECTID 0ULL 58 61

Configure Feed

Configure Feed