Merge tag 'for-4.20-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

+3

fs/btrfs/ctree.h

··· 3163 3163 int btrfs_drop_inode(struct inode *inode); 3164 3164 int __init btrfs_init_cachep(void); 3165 3165 void __cold btrfs_destroy_cachep(void); 3166 + struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, 3167 + struct btrfs_root *root, int *new, 3168 + struct btrfs_path *path); 3166 3169 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 3167 3170 struct btrfs_root *root, int *was_new); 3168 3171 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,

+26 -37

fs/btrfs/disk-io.c

··· 1664 1664 struct btrfs_root *root = arg; 1665 1665 struct btrfs_fs_info *fs_info = root->fs_info; 1666 1666 int again; 1667 - struct btrfs_trans_handle *trans; 1668 1667 1669 - do { 1668 + while (1) { 1670 1669 again = 0; 1671 1670 1672 1671 /* Make the cleaner go to sleep early. */ ··· 1714 1715 */ 1715 1716 btrfs_delete_unused_bgs(fs_info); 1716 1717 sleep: 1718 + if (kthread_should_park()) 1719 + kthread_parkme(); 1720 + if (kthread_should_stop()) 1721 + return 0; 1717 1722 if (!again) { 1718 1723 set_current_state(TASK_INTERRUPTIBLE); 1719 - if (!kthread_should_stop()) 1720 - schedule(); 1724 + schedule(); 1721 1725 __set_current_state(TASK_RUNNING); 1722 1726 } 1723 - } while (!kthread_should_stop()); 1724 - 1725 - /* 1726 - * Transaction kthread is stopped before us and wakes us up. 1727 - * However we might have started a new transaction and COWed some 1728 - * tree blocks when deleting unused block groups for example. So 1729 - * make sure we commit the transaction we started to have a clean 1730 - * shutdown when evicting the btree inode - if it has dirty pages 1731 - * when we do the final iput() on it, eviction will trigger a 1732 - * writeback for it which will fail with null pointer dereferences 1733 - * since work queues and other resources were already released and 1734 - * destroyed by the time the iput/eviction/writeback is made. 1735 - */ 1736 - trans = btrfs_attach_transaction(root); 1737 - if (IS_ERR(trans)) { 1738 - if (PTR_ERR(trans) != -ENOENT) 1739 - btrfs_err(fs_info, 1740 - "cleaner transaction attach returned %ld", 1741 - PTR_ERR(trans)); 1742 - } else { 1743 - int ret; 1744 - 1745 - ret = btrfs_commit_transaction(trans); 1746 - if (ret) 1747 - btrfs_err(fs_info, 1748 - "cleaner open transaction commit returned %d", 1749 - ret); 1750 1727 } 1751 - 1752 - return 0; 1753 1728 } 1754 1729 1755 1730 static int transaction_kthread(void *arg) ··· 3904 3931 int ret; 3905 3932 3906 3933 set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); 3934 + /* 3935 + * We don't want the cleaner to start new transactions, add more delayed 3936 + * iputs, etc. while we're closing. We can't use kthread_stop() yet 3937 + * because that frees the task_struct, and the transaction kthread might 3938 + * still try to wake up the cleaner. 3939 + */ 3940 + kthread_park(fs_info->cleaner_kthread); 3907 3941 3908 3942 /* wait for the qgroup rescan worker to stop */ 3909 3943 btrfs_qgroup_wait_for_completion(fs_info, false); ··· 3938 3958 3939 3959 if (!sb_rdonly(fs_info->sb)) { 3940 3960 /* 3941 - * If the cleaner thread is stopped and there are 3942 - * block groups queued for removal, the deletion will be 3943 - * skipped when we quit the cleaner thread. 3961 + * The cleaner kthread is stopped, so do one final pass over 3962 + * unused block groups. 3944 3963 */ 3945 3964 btrfs_delete_unused_bgs(fs_info); 3946 3965 ··· 4338 4359 unpin = pinned_extents; 4339 4360 again: 4340 4361 while (1) { 4362 + /* 4363 + * The btrfs_finish_extent_commit() may get the same range as 4364 + * ours between find_first_extent_bit and clear_extent_dirty. 4365 + * Hence, hold the unused_bg_unpin_mutex to avoid double unpin 4366 + * the same extent range. 4367 + */ 4368 + mutex_lock(&fs_info->unused_bg_unpin_mutex); 4341 4369 ret = find_first_extent_bit(unpin, 0, &start, &end, 4342 4370 EXTENT_DIRTY, NULL); 4343 - if (ret) 4371 + if (ret) { 4372 + mutex_unlock(&fs_info->unused_bg_unpin_mutex); 4344 4373 break; 4374 + } 4345 4375 4346 4376 clear_extent_dirty(unpin, start, end); 4347 4377 btrfs_error_unpin_extent_range(fs_info, start, end); 4378 + mutex_unlock(&fs_info->unused_bg_unpin_mutex); 4348 4379 cond_resched(); 4349 4380 } 4350 4381

+21 -1

fs/btrfs/free-space-cache.c

··· 75 75 * sure NOFS is set to keep us from deadlocking. 76 76 */ 77 77 nofs_flag = memalloc_nofs_save(); 78 - inode = btrfs_iget(fs_info->sb, &location, root, NULL); 78 + inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path); 79 + btrfs_release_path(path); 79 80 memalloc_nofs_restore(nofs_flag); 80 81 if (IS_ERR(inode)) 81 82 return inode; ··· 839 838 path->search_commit_root = 1; 840 839 path->skip_locking = 1; 841 840 841 + /* 842 + * We must pass a path with search_commit_root set to btrfs_iget in 843 + * order to avoid a deadlock when allocating extents for the tree root. 844 + * 845 + * When we are COWing an extent buffer from the tree root, when looking 846 + * for a free extent, at extent-tree.c:find_free_extent(), we can find 847 + * block group without its free space cache loaded. When we find one 848 + * we must load its space cache which requires reading its free space 849 + * cache's inode item from the root tree. If this inode item is located 850 + * in the same leaf that we started COWing before, then we end up in 851 + * deadlock on the extent buffer (trying to read lock it when we 852 + * previously write locked it). 853 + * 854 + * It's safe to read the inode item using the commit root because 855 + * block groups, once loaded, stay in memory forever (until they are 856 + * removed) as well as their space caches once loaded. New block groups 857 + * once created get their ->cached field set to BTRFS_CACHE_FINISHED so 858 + * we will never try to read their inode item while the fs is mounted. 859 + */ 842 860 inode = lookup_free_space_inode(fs_info, block_group, path); 843 861 if (IS_ERR(inode)) { 844 862 btrfs_free_path(path);

+24 -13

fs/btrfs/inode.c

··· 1531 1531 } 1532 1532 btrfs_release_path(path); 1533 1533 1534 - if (cur_offset <= end && cow_start == (u64)-1) { 1534 + if (cur_offset <= end && cow_start == (u64)-1) 1535 1535 cow_start = cur_offset; 1536 - cur_offset = end; 1537 - } 1538 1536 1539 1537 if (cow_start != (u64)-1) { 1538 + cur_offset = end; 1540 1539 ret = cow_file_range(inode, locked_page, cow_start, end, end, 1541 1540 page_started, nr_written, 1, NULL); 1542 1541 if (ret) ··· 3569 3570 /* 3570 3571 * read an inode from the btree into the in-memory inode 3571 3572 */ 3572 - static int btrfs_read_locked_inode(struct inode *inode) 3573 + static int btrfs_read_locked_inode(struct inode *inode, 3574 + struct btrfs_path *in_path) 3573 3575 { 3574 3576 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3575 - struct btrfs_path *path; 3577 + struct btrfs_path *path = in_path; 3576 3578 struct extent_buffer *leaf; 3577 3579 struct btrfs_inode_item *inode_item; 3578 3580 struct btrfs_root *root = BTRFS_I(inode)->root; ··· 3589 3589 if (!ret) 3590 3590 filled = true; 3591 3591 3592 - path = btrfs_alloc_path(); 3593 - if (!path) 3594 - return -ENOMEM; 3592 + if (!path) { 3593 + path = btrfs_alloc_path(); 3594 + if (!path) 3595 + return -ENOMEM; 3596 + } 3595 3597 3596 3598 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 3597 3599 3598 3600 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3599 3601 if (ret) { 3600 - btrfs_free_path(path); 3602 + if (path != in_path) 3603 + btrfs_free_path(path); 3601 3604 return ret; 3602 3605 } 3603 3606 ··· 3725 3722 btrfs_ino(BTRFS_I(inode)), 3726 3723 root->root_key.objectid, ret); 3727 3724 } 3728 - btrfs_free_path(path); 3725 + if (path != in_path) 3726 + btrfs_free_path(path); 3729 3727 3730 3728 if (!maybe_acls) 3731 3729 cache_no_acl(inode); ··· 5648 5644 /* Get an inode object given its location and corresponding root. 5649 5645 * Returns in *is_new if the inode was read from disk 5650 5646 */ 5651 - struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 5652 - struct btrfs_root *root, int *new) 5647 + struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, 5648 + struct btrfs_root *root, int *new, 5649 + struct btrfs_path *path) 5653 5650 { 5654 5651 struct inode *inode; 5655 5652 ··· 5661 5656 if (inode->i_state & I_NEW) { 5662 5657 int ret; 5663 5658 5664 - ret = btrfs_read_locked_inode(inode); 5659 + ret = btrfs_read_locked_inode(inode, path); 5665 5660 if (!ret) { 5666 5661 inode_tree_add(inode); 5667 5662 unlock_new_inode(inode); ··· 5681 5676 } 5682 5677 5683 5678 return inode; 5679 + } 5680 + 5681 + struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 5682 + struct btrfs_root *root, int *new) 5683 + { 5684 + return btrfs_iget_path(s, location, root, new, NULL); 5684 5685 } 5685 5686 5686 5687 static struct inode *new_simple_dir(struct super_block *s,

+12 -2

fs/btrfs/ioctl.c

··· 3488 3488 const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize; 3489 3489 3490 3490 len = round_down(i_size_read(src), sz) - loff; 3491 + if (len == 0) 3492 + return 0; 3491 3493 olen = len; 3492 3494 } 3493 3495 } ··· 4259 4257 goto out_unlock; 4260 4258 if (len == 0) 4261 4259 olen = len = src->i_size - off; 4262 - /* if we extend to eof, continue to block boundary */ 4263 - if (off + len == src->i_size) 4260 + /* 4261 + * If we extend to eof, continue to block boundary if and only if the 4262 + * destination end offset matches the destination file's size, otherwise 4263 + * we would be corrupting data by placing the eof block into the middle 4264 + * of a file. 4265 + */ 4266 + if (off + len == src->i_size) { 4267 + if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size) 4268 + goto out_unlock; 4264 4269 len = ALIGN(src->i_size, bs) - off; 4270 + } 4265 4271 4266 4272 if (len == 0) { 4267 4273 ret = 0;

+3 -3

fs/btrfs/super.c

··· 1916 1916 } 1917 1917 1918 1918 /* Used to sort the devices by max_avail(descending sort) */ 1919 - static int btrfs_cmp_device_free_bytes(const void *dev_info1, 1919 + static inline int btrfs_cmp_device_free_bytes(const void *dev_info1, 1920 1920 const void *dev_info2) 1921 1921 { 1922 1922 if (((struct btrfs_device_info *)dev_info1)->max_avail > ··· 1945 1945 * The helper to calc the free space on the devices that can be used to store 1946 1946 * file data. 1947 1947 */ 1948 - static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, 1949 - u64 *free_bytes) 1948 + static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, 1949 + u64 *free_bytes) 1950 1950 { 1951 1951 struct btrfs_device_info *devices_info; 1952 1952 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

+1 -1

fs/btrfs/tree-checker.c

··· 440 440 type != (BTRFS_BLOCK_GROUP_METADATA | 441 441 BTRFS_BLOCK_GROUP_DATA)) { 442 442 block_group_err(fs_info, leaf, slot, 443 - "invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llu or 0x%llx", 443 + "invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", 444 444 type, hweight64(type), 445 445 BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, 446 446 BTRFS_BLOCK_GROUP_SYSTEM,

+17

fs/btrfs/tree-log.c

··· 4396 4396 logged_end = end; 4397 4397 4398 4398 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4399 + /* 4400 + * Skip extents outside our logging range. It's important to do 4401 + * it for correctness because if we don't ignore them, we may 4402 + * log them before their ordered extent completes, and therefore 4403 + * we could log them without logging their respective checksums 4404 + * (the checksum items are added to the csum tree at the very 4405 + * end of btrfs_finish_ordered_io()). Also leave such extents 4406 + * outside of our range in the list, since we may have another 4407 + * ranged fsync in the near future that needs them. If an extent 4408 + * outside our range corresponds to a hole, log it to avoid 4409 + * leaving gaps between extents (fsck will complain when we are 4410 + * not using the NO_HOLES feature). 4411 + */ 4412 + if ((em->start > end || em->start + em->len <= start) && 4413 + em->block_start != EXTENT_MAP_HOLE) 4414 + continue; 4415 + 4399 4416 list_del_init(&em->list); 4400 4417 /* 4401 4418 * Just an arbitrary number, this can be really CPU intensive

Configure Feed

Configure Feed