Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
Btrfs: fix data space leak fix
Btrfs: remove duplicates of filemap_ helpers
Btrfs: take i_mutex before generic_write_checks
Btrfs: fix arguments to btrfs_wait_on_page_writeback_range
Btrfs: fix deadlock with free space handling and user transactions
Btrfs: fix error cases for ioctl transactions
Btrfs: Use CONFIG_BTRFS_POSIX_ACL to enable ACL code
Btrfs: introduce missing kfree
Btrfs: Fix setting umask when POSIX ACLs are not enabled
Btrfs: proper -ENOSPC handling

+749 -249
+3 -3
fs/btrfs/acl.c
··· 27 27 #include "btrfs_inode.h" 28 28 #include "xattr.h" 29 29 30 - #ifdef CONFIG_FS_POSIX_ACL 30 + #ifdef CONFIG_BTRFS_POSIX_ACL 31 31 32 32 static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) 33 33 { ··· 313 313 .set = btrfs_xattr_acl_access_set, 314 314 }; 315 315 316 - #else /* CONFIG_FS_POSIX_ACL */ 316 + #else /* CONFIG_BTRFS_POSIX_ACL */ 317 317 318 318 int btrfs_acl_chmod(struct inode *inode) 319 319 { ··· 325 325 return 0; 326 326 } 327 327 328 - #endif /* CONFIG_FS_POSIX_ACL */ 328 + #endif /* CONFIG_BTRFS_POSIX_ACL */
+8
fs/btrfs/btrfs_inode.h
··· 128 128 u64 last_unlink_trans; 129 129 130 130 /* 131 + * These two counters are for delalloc metadata reservations. We keep 132 + * track of how many extents we've accounted for vs how many extents we 133 + * have. 134 + */ 135 + int delalloc_reserved_extents; 136 + int delalloc_extents; 137 + 138 + /* 131 139 * ordered_data_close is set by truncate when a file that used 132 140 * to have good data has been truncated to zero. When it is set 133 141 * the btrfs file release call will add this inode to the
+17 -8
fs/btrfs/ctree.h
··· 675 675 current allocations */ 676 676 u64 bytes_readonly; /* total bytes that are read only */ 677 677 u64 bytes_super; /* total bytes reserved for the super blocks */ 678 - 679 - /* delalloc accounting */ 680 - u64 bytes_delalloc; /* number of bytes reserved for allocation, 681 - this space is not necessarily reserved yet 682 - by the allocator */ 678 + u64 bytes_root; /* the number of bytes needed to commit a 679 + transaction */ 683 680 u64 bytes_may_use; /* number of bytes that may be used for 684 - delalloc */ 681 + delalloc/allocations */ 682 + u64 bytes_delalloc; /* number of bytes currently reserved for 683 + delayed allocation */ 685 684 686 685 int full; /* indicates that we cannot allocate any more 687 686 chunks for this space */ 688 687 int force_alloc; /* set if we need to force a chunk alloc for 689 688 this space */ 689 + int force_delalloc; /* make people start doing filemap_flush until 690 + we're under a threshold */ 690 691 691 692 struct list_head list; 692 693 ··· 696 695 spinlock_t lock; 697 696 struct rw_semaphore groups_sem; 698 697 atomic_t caching_threads; 698 + 699 + int allocating_chunk; 700 + wait_queue_head_t wait; 699 701 }; 700 702 701 703 /* ··· 2026 2022 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2027 2023 void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2028 2024 2029 - int btrfs_check_metadata_free_space(struct btrfs_root *root); 2025 + int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); 2026 + int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); 2027 + int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2028 + struct inode *inode, int num_items); 2029 + int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 2030 + struct inode *inode, int num_items); 2030 2031 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2031 2032 u64 bytes); 2032 2033 void btrfs_free_reserved_data_space(struct btrfs_root *root, ··· 2366 2357 int btrfs_sync_fs(struct super_block *sb, int wait); 2367 2358 2368 2359 /* acl.c */ 2369 - #ifdef CONFIG_FS_POSIX_ACL 2360 + #ifdef CONFIG_BTRFS_POSIX_ACL 2370 2361 int btrfs_check_acl(struct inode *inode, int mask); 2371 2362 #else 2372 2363 #define btrfs_check_acl NULL
+5 -5
fs/btrfs/disk-io.c
··· 822 822 823 823 int btrfs_write_tree_block(struct extent_buffer *buf) 824 824 { 825 - return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, 826 - buf->start + buf->len - 1, WB_SYNC_ALL); 825 + return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, 826 + buf->start + buf->len - 1); 827 827 } 828 828 829 829 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 830 830 { 831 - return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, 832 - buf->start, buf->start + buf->len - 1); 831 + return filemap_fdatawait_range(buf->first_page->mapping, 832 + buf->start, buf->start + buf->len - 1); 833 833 } 834 834 835 835 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, ··· 1630 1630 fs_info->sb = sb; 1631 1631 fs_info->max_extent = (u64)-1; 1632 1632 fs_info->max_inline = 8192 * 1024; 1633 - fs_info->metadata_ratio = 8; 1633 + fs_info->metadata_ratio = 0; 1634 1634 1635 1635 fs_info->thread_pool_size = min_t(unsigned long, 1636 1636 num_online_cpus() + 2, 8);
+345 -52
fs/btrfs/extent-tree.c
··· 68 68 struct extent_buffer **must_clean); 69 69 static int find_next_key(struct btrfs_path *path, int level, 70 70 struct btrfs_key *key); 71 + static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 72 + int dump_block_groups); 71 73 72 74 static noinline int 73 75 block_group_cache_done(struct btrfs_block_group_cache *cache) ··· 2767 2765 alloc_target); 2768 2766 } 2769 2767 2768 + static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) 2769 + { 2770 + u64 num_bytes; 2771 + int level; 2772 + 2773 + level = BTRFS_MAX_LEVEL - 2; 2774 + /* 2775 + * NOTE: these calculations are absolutely the worst possible case. 2776 + * This assumes that _every_ item we insert will require a new leaf, and 2777 + * that the tree has grown to its maximum level size. 2778 + */ 2779 + 2780 + /* 2781 + * for every item we insert we could insert both an extent item and a 2782 + * extent ref item. Then for ever item we insert, we will need to cow 2783 + * both the original leaf, plus the leaf to the left and right of it. 2784 + * 2785 + * Unless we are talking about the extent root, then we just want the 2786 + * number of items * 2, since we just need the extent item plus its ref. 2787 + */ 2788 + if (root == root->fs_info->extent_root) 2789 + num_bytes = num_items * 2; 2790 + else 2791 + num_bytes = (num_items + (2 * num_items)) * 3; 2792 + 2793 + /* 2794 + * num_bytes is total number of leaves we could need times the leaf 2795 + * size, and then for every leaf we could end up cow'ing 2 nodes per 2796 + * level, down to the leaf level. 2797 + */ 2798 + num_bytes = (num_bytes * root->leafsize) + 2799 + (num_bytes * (level * 2)) * root->nodesize; 2800 + 2801 + return num_bytes; 2802 + } 2803 + 2770 2804 /* 2771 - * for now this just makes sure we have at least 5% of our metadata space free 2772 - * for use. 2805 + * Unreserve metadata space for delalloc. If we have less reserved credits than 2806 + * we have extents, this function does nothing. 2773 2807 */ 2774 - int btrfs_check_metadata_free_space(struct btrfs_root *root) 2808 + int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2809 + struct inode *inode, int num_items) 2775 2810 { 2776 2811 struct btrfs_fs_info *info = root->fs_info; 2777 2812 struct btrfs_space_info *meta_sinfo; 2778 - u64 alloc_target, thresh; 2779 - int committed = 0, ret; 2813 + u64 num_bytes; 2814 + u64 alloc_target; 2815 + bool bug = false; 2780 2816 2781 2817 /* get the space info for where the metadata will live */ 2782 2818 alloc_target = btrfs_get_alloc_profile(root, 0); 2783 2819 meta_sinfo = __find_space_info(info, alloc_target); 2784 - if (!meta_sinfo) 2785 - goto alloc; 2786 2820 2821 + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, 2822 + num_items); 2823 + 2824 + spin_lock(&meta_sinfo->lock); 2825 + if (BTRFS_I(inode)->delalloc_reserved_extents <= 2826 + BTRFS_I(inode)->delalloc_extents) { 2827 + spin_unlock(&meta_sinfo->lock); 2828 + return 0; 2829 + } 2830 + 2831 + BTRFS_I(inode)->delalloc_reserved_extents--; 2832 + BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0); 2833 + 2834 + if (meta_sinfo->bytes_delalloc < num_bytes) { 2835 + bug = true; 2836 + meta_sinfo->bytes_delalloc = 0; 2837 + } else { 2838 + meta_sinfo->bytes_delalloc -= num_bytes; 2839 + } 2840 + spin_unlock(&meta_sinfo->lock); 2841 + 2842 + BUG_ON(bug); 2843 + 2844 + return 0; 2845 + } 2846 + 2847 + static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) 2848 + { 2849 + u64 thresh; 2850 + 2851 + thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2852 + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2853 + meta_sinfo->bytes_super + meta_sinfo->bytes_root + 2854 + meta_sinfo->bytes_may_use; 2855 + 2856 + thresh = meta_sinfo->total_bytes - thresh; 2857 + thresh *= 80; 2858 + do_div(thresh, 100); 2859 + if (thresh <= meta_sinfo->bytes_delalloc) 2860 + meta_sinfo->force_delalloc = 1; 2861 + else 2862 + meta_sinfo->force_delalloc = 0; 2863 + } 2864 + 2865 + static int maybe_allocate_chunk(struct btrfs_root *root, 2866 + struct btrfs_space_info *info) 2867 + { 2868 + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 2869 + struct btrfs_trans_handle *trans; 2870 + bool wait = false; 2871 + int ret = 0; 2872 + u64 min_metadata; 2873 + u64 free_space; 2874 + 2875 + free_space = btrfs_super_total_bytes(disk_super); 2876 + /* 2877 + * we allow the metadata to grow to a max of either 5gb or 5% of the 2878 + * space in the volume. 2879 + */ 2880 + min_metadata = min((u64)5 * 1024 * 1024 * 1024, 2881 + div64_u64(free_space * 5, 100)); 2882 + if (info->total_bytes >= min_metadata) { 2883 + spin_unlock(&info->lock); 2884 + return 0; 2885 + } 2886 + 2887 + if (info->full) { 2888 + spin_unlock(&info->lock); 2889 + return 0; 2890 + } 2891 + 2892 + if (!info->allocating_chunk) { 2893 + info->force_alloc = 1; 2894 + info->allocating_chunk = 1; 2895 + init_waitqueue_head(&info->wait); 2896 + } else { 2897 + wait = true; 2898 + } 2899 + 2900 + spin_unlock(&info->lock); 2901 + 2902 + if (wait) { 2903 + wait_event(info->wait, 2904 + !info->allocating_chunk); 2905 + return 1; 2906 + } 2907 + 2908 + trans = btrfs_start_transaction(root, 1); 2909 + if (!trans) { 2910 + ret = -ENOMEM; 2911 + goto out; 2912 + } 2913 + 2914 + ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2915 + 4096 + 2 * 1024 * 1024, 2916 + info->flags, 0); 2917 + btrfs_end_transaction(trans, root); 2918 + if (ret) 2919 + goto out; 2920 + out: 2921 + spin_lock(&info->lock); 2922 + info->allocating_chunk = 0; 2923 + spin_unlock(&info->lock); 2924 + wake_up(&info->wait); 2925 + 2926 + if (ret) 2927 + return 0; 2928 + return 1; 2929 + } 2930 + 2931 + /* 2932 + * Reserve metadata space for delalloc. 2933 + */ 2934 + int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 2935 + struct inode *inode, int num_items) 2936 + { 2937 + struct btrfs_fs_info *info = root->fs_info; 2938 + struct btrfs_space_info *meta_sinfo; 2939 + u64 num_bytes; 2940 + u64 used; 2941 + u64 alloc_target; 2942 + int flushed = 0; 2943 + int force_delalloc; 2944 + 2945 + /* get the space info for where the metadata will live */ 2946 + alloc_target = btrfs_get_alloc_profile(root, 0); 2947 + meta_sinfo = __find_space_info(info, alloc_target); 2948 + 2949 + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, 2950 + num_items); 2787 2951 again: 2788 2952 spin_lock(&meta_sinfo->lock); 2789 - if (!meta_sinfo->full) 2790 - thresh = meta_sinfo->total_bytes * 80; 2791 - else 2792 - thresh = meta_sinfo->total_bytes * 95; 2793 2953 2794 - do_div(thresh, 100); 2954 + force_delalloc = meta_sinfo->force_delalloc; 2795 2955 2796 - if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2797 - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2798 - meta_sinfo->bytes_super > thresh) { 2799 - struct btrfs_trans_handle *trans; 2800 - if (!meta_sinfo->full) { 2801 - meta_sinfo->force_alloc = 1; 2956 + if (unlikely(!meta_sinfo->bytes_root)) 2957 + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); 2958 + 2959 + if (!flushed) 2960 + meta_sinfo->bytes_delalloc += num_bytes; 2961 + 2962 + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2963 + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2964 + meta_sinfo->bytes_super + meta_sinfo->bytes_root + 2965 + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; 2966 + 2967 + if (used > meta_sinfo->total_bytes) { 2968 + flushed++; 2969 + 2970 + if (flushed == 1) { 2971 + if (maybe_allocate_chunk(root, meta_sinfo)) 2972 + goto again; 2973 + flushed++; 2974 + } else { 2802 2975 spin_unlock(&meta_sinfo->lock); 2803 - alloc: 2804 - trans = btrfs_start_transaction(root, 1); 2805 - if (!trans) 2806 - return -ENOMEM; 2976 + } 2807 2977 2808 - ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2809 - 2 * 1024 * 1024, alloc_target, 0); 2810 - btrfs_end_transaction(trans, root); 2811 - if (!meta_sinfo) { 2812 - meta_sinfo = __find_space_info(info, 2813 - alloc_target); 2814 - } 2978 + if (flushed == 2) { 2979 + filemap_flush(inode->i_mapping); 2980 + goto again; 2981 + } else if (flushed == 3) { 2982 + btrfs_start_delalloc_inodes(root); 2983 + btrfs_wait_ordered_extents(root, 0); 2815 2984 goto again; 2816 2985 } 2986 + spin_lock(&meta_sinfo->lock); 2987 + meta_sinfo->bytes_delalloc -= num_bytes; 2817 2988 spin_unlock(&meta_sinfo->lock); 2818 - 2819 - if (!committed) { 2820 - committed = 1; 2821 - trans = btrfs_join_transaction(root, 1); 2822 - if (!trans) 2823 - return -ENOMEM; 2824 - ret = btrfs_commit_transaction(trans, root); 2825 - if (ret) 2826 - return ret; 2827 - goto again; 2828 - } 2989 + printk(KERN_ERR "enospc, has %d, reserved %d\n", 2990 + BTRFS_I(inode)->delalloc_extents, 2991 + BTRFS_I(inode)->delalloc_reserved_extents); 2992 + dump_space_info(meta_sinfo, 0, 0); 2829 2993 return -ENOSPC; 2830 2994 } 2995 + 2996 + BTRFS_I(inode)->delalloc_reserved_extents++; 2997 + check_force_delalloc(meta_sinfo); 2998 + spin_unlock(&meta_sinfo->lock); 2999 + 3000 + if (!flushed && force_delalloc) 3001 + filemap_flush(inode->i_mapping); 3002 + 3003 + return 0; 3004 + } 3005 + 3006 + /* 3007 + * unreserve num_items number of items worth of metadata space. This needs to 3008 + * be paired with btrfs_reserve_metadata_space. 3009 + * 3010 + * NOTE: if you have the option, run this _AFTER_ you do a 3011 + * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref 3012 + * oprations which will result in more used metadata, so we want to make sure we 3013 + * can do that without issue. 3014 + */ 3015 + int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) 3016 + { 3017 + struct btrfs_fs_info *info = root->fs_info; 3018 + struct btrfs_space_info *meta_sinfo; 3019 + u64 num_bytes; 3020 + u64 alloc_target; 3021 + bool bug = false; 3022 + 3023 + /* get the space info for where the metadata will live */ 3024 + alloc_target = btrfs_get_alloc_profile(root, 0); 3025 + meta_sinfo = __find_space_info(info, alloc_target); 3026 + 3027 + num_bytes = calculate_bytes_needed(root, num_items); 3028 + 3029 + spin_lock(&meta_sinfo->lock); 3030 + if (meta_sinfo->bytes_may_use < num_bytes) { 3031 + bug = true; 3032 + meta_sinfo->bytes_may_use = 0; 3033 + } else { 3034 + meta_sinfo->bytes_may_use -= num_bytes; 3035 + } 3036 + spin_unlock(&meta_sinfo->lock); 3037 + 3038 + BUG_ON(bug); 3039 + 3040 + return 0; 3041 + } 3042 + 3043 + /* 3044 + * Reserve some metadata space for use. We'll calculate the worste case number 3045 + * of bytes that would be needed to modify num_items number of items. If we 3046 + * have space, fantastic, if not, you get -ENOSPC. Please call 3047 + * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of 3048 + * items you reserved, since whatever metadata you needed should have already 3049 + * been allocated. 3050 + * 3051 + * This will commit the transaction to make more space if we don't have enough 3052 + * metadata space. THe only time we don't do this is if we're reserving space 3053 + * inside of a transaction, then we will just return -ENOSPC and it is the 3054 + * callers responsibility to handle it properly. 3055 + */ 3056 + int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) 3057 + { 3058 + struct btrfs_fs_info *info = root->fs_info; 3059 + struct btrfs_space_info *meta_sinfo; 3060 + u64 num_bytes; 3061 + u64 used; 3062 + u64 alloc_target; 3063 + int retries = 0; 3064 + 3065 + /* get the space info for where the metadata will live */ 3066 + alloc_target = btrfs_get_alloc_profile(root, 0); 3067 + meta_sinfo = __find_space_info(info, alloc_target); 3068 + 3069 + num_bytes = calculate_bytes_needed(root, num_items); 3070 + again: 3071 + spin_lock(&meta_sinfo->lock); 3072 + 3073 + if (unlikely(!meta_sinfo->bytes_root)) 3074 + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); 3075 + 3076 + if (!retries) 3077 + meta_sinfo->bytes_may_use += num_bytes; 3078 + 3079 + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 3080 + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 3081 + meta_sinfo->bytes_super + meta_sinfo->bytes_root + 3082 + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; 3083 + 3084 + if (used > meta_sinfo->total_bytes) { 3085 + retries++; 3086 + if (retries == 1) { 3087 + if (maybe_allocate_chunk(root, meta_sinfo)) 3088 + goto again; 3089 + retries++; 3090 + } else { 3091 + spin_unlock(&meta_sinfo->lock); 3092 + } 3093 + 3094 + if (retries == 2) { 3095 + btrfs_start_delalloc_inodes(root); 3096 + btrfs_wait_ordered_extents(root, 0); 3097 + goto again; 3098 + } 3099 + spin_lock(&meta_sinfo->lock); 3100 + meta_sinfo->bytes_may_use -= num_bytes; 3101 + spin_unlock(&meta_sinfo->lock); 3102 + 3103 + dump_space_info(meta_sinfo, 0, 0); 3104 + return -ENOSPC; 3105 + } 3106 + 3107 + check_force_delalloc(meta_sinfo); 2831 3108 spin_unlock(&meta_sinfo->lock); 2832 3109 2833 3110 return 0; ··· 3169 2888 spin_unlock(&data_sinfo->lock); 3170 2889 3171 2890 /* commit the current transaction and try again */ 3172 - if (!committed) { 2891 + if (!committed && !root->fs_info->open_ioctl_trans) { 3173 2892 committed = 1; 3174 2893 trans = btrfs_join_transaction(root, 1); 3175 2894 if (!trans) ··· 3197 2916 BTRFS_I(inode)->reserved_bytes += bytes; 3198 2917 spin_unlock(&data_sinfo->lock); 3199 2918 3200 - return btrfs_check_metadata_free_space(root); 2919 + return 0; 3201 2920 } 3202 2921 3203 2922 /* ··· 3296 3015 BUG_ON(!space_info); 3297 3016 3298 3017 spin_lock(&space_info->lock); 3299 - if (space_info->force_alloc) { 3018 + if (space_info->force_alloc) 3300 3019 force = 1; 3301 - space_info->force_alloc = 0; 3302 - } 3303 3020 if (space_info->full) { 3304 3021 spin_unlock(&space_info->lock); 3305 3022 goto out; 3306 3023 } 3307 3024 3308 3025 thresh = space_info->total_bytes - space_info->bytes_readonly; 3309 - thresh = div_factor(thresh, 6); 3026 + thresh = div_factor(thresh, 8); 3310 3027 if (!force && 3311 3028 (space_info->bytes_used + space_info->bytes_pinned + 3312 3029 space_info->bytes_reserved + alloc_bytes) < thresh) { ··· 3318 3039 * we keep a reasonable number of metadata chunks allocated in the 3319 3040 * FS as well. 3320 3041 */ 3321 - if (flags & BTRFS_BLOCK_GROUP_DATA) { 3042 + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3322 3043 fs_info->data_chunk_allocations++; 3323 3044 if (!(fs_info->data_chunk_allocations % 3324 3045 fs_info->metadata_ratio)) ··· 3326 3047 } 3327 3048 3328 3049 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3050 + spin_lock(&space_info->lock); 3329 3051 if (ret) 3330 3052 space_info->full = 1; 3053 + space_info->force_alloc = 0; 3054 + spin_unlock(&space_info->lock); 3331 3055 out: 3332 3056 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3333 3057 return ret; ··· 4345 4063 return ret; 4346 4064 } 4347 4065 4348 - static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 4066 + static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 4067 + int dump_block_groups) 4349 4068 { 4350 4069 struct btrfs_block_group_cache *cache; 4351 4070 4071 + spin_lock(&info->lock); 4352 4072 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4353 4073 (unsigned long long)(info->total_bytes - info->bytes_used - 4354 - info->bytes_pinned - info->bytes_reserved), 4074 + info->bytes_pinned - info->bytes_reserved - 4075 + info->bytes_super), 4355 4076 (info->full) ? "" : "not "); 4356 4077 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4357 - " may_use=%llu, used=%llu\n", 4078 + " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4079 + "\n", 4358 4080 (unsigned long long)info->total_bytes, 4359 4081 (unsigned long long)info->bytes_pinned, 4360 4082 (unsigned long long)info->bytes_delalloc, 4361 4083 (unsigned long long)info->bytes_may_use, 4362 - (unsigned long long)info->bytes_used); 4084 + (unsigned long long)info->bytes_used, 4085 + (unsigned long long)info->bytes_root, 4086 + (unsigned long long)info->bytes_super, 4087 + (unsigned long long)info->bytes_reserved); 4088 + spin_unlock(&info->lock); 4089 + 4090 + if (!dump_block_groups) 4091 + return; 4363 4092 4364 4093 down_read(&info->groups_sem); 4365 4094 list_for_each_entry(cache, &info->block_groups, list) { ··· 4438 4145 printk(KERN_ERR "btrfs allocation failed flags %llu, " 4439 4146 "wanted %llu\n", (unsigned long long)data, 4440 4147 (unsigned long long)num_bytes); 4441 - dump_space_info(sinfo, num_bytes); 4148 + dump_space_info(sinfo, num_bytes, 1); 4442 4149 } 4443 4150 4444 4151 return ret;
+69 -23
fs/btrfs/extent_io.c
··· 280 280 return NULL; 281 281 } 282 282 283 + static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 284 + struct extent_state *other) 285 + { 286 + if (tree->ops && tree->ops->merge_extent_hook) 287 + tree->ops->merge_extent_hook(tree->mapping->host, new, 288 + other); 289 + } 290 + 283 291 /* 284 292 * utility function to look for merge candidates inside a given range. 285 293 * Any extents with matching state are merged together into a single ··· 311 303 other = rb_entry(other_node, struct extent_state, rb_node); 312 304 if (other->end == state->start - 1 && 313 305 other->state == state->state) { 306 + merge_cb(tree, state, other); 314 307 state->start = other->start; 315 308 other->tree = NULL; 316 309 rb_erase(&other->rb_node, &tree->state); ··· 323 314 other = rb_entry(other_node, struct extent_state, rb_node); 324 315 if (other->start == state->end + 1 && 325 316 other->state == state->state) { 317 + merge_cb(tree, state, other); 326 318 other->start = state->start; 327 319 state->tree = NULL; 328 320 rb_erase(&state->rb_node, &tree->state); 329 321 free_extent_state(state); 322 + state = NULL; 330 323 } 331 324 } 325 + 332 326 return 0; 333 327 } 334 328 335 - static void set_state_cb(struct extent_io_tree *tree, 329 + static int set_state_cb(struct extent_io_tree *tree, 336 330 struct extent_state *state, 337 331 unsigned long bits) 338 332 { 339 333 if (tree->ops && tree->ops->set_bit_hook) { 340 - tree->ops->set_bit_hook(tree->mapping->host, state->start, 341 - state->end, state->state, bits); 334 + return tree->ops->set_bit_hook(tree->mapping->host, 335 + state->start, state->end, 336 + state->state, bits); 342 337 } 338 + 339 + return 0; 343 340 } 344 341 345 342 static void clear_state_cb(struct extent_io_tree *tree, 346 343 struct extent_state *state, 347 344 unsigned long bits) 348 345 { 349 - if (tree->ops && tree->ops->clear_bit_hook) { 350 - tree->ops->clear_bit_hook(tree->mapping->host, state->start, 351 - state->end, state->state, bits); 352 - } 346 + if (tree->ops && tree->ops->clear_bit_hook) 347 + tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 353 348 } 354 349 355 350 /* ··· 371 358 int bits) 372 359 { 373 360 struct rb_node *node; 361 + int ret; 374 362 375 363 if (end < start) { 376 364 printk(KERN_ERR "btrfs end < start %llu %llu\n", ··· 379 365 (unsigned long long)start); 380 366 WARN_ON(1); 381 367 } 382 - if (bits & EXTENT_DIRTY) 383 - tree->dirty_bytes += end - start + 1; 384 368 state->start = start; 385 369 state->end = end; 386 - set_state_cb(tree, state, bits); 370 + ret = set_state_cb(tree, state, bits); 371 + if (ret) 372 + return ret; 373 + 374 + if (bits & EXTENT_DIRTY) 375 + tree->dirty_bytes += end - start + 1; 387 376 state->state |= bits; 388 377 node = tree_insert(&tree->state, end, &state->rb_node); 389 378 if (node) { ··· 401 384 } 402 385 state->tree = tree; 403 386 merge_state(tree, state); 387 + return 0; 388 + } 389 + 390 + static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, 391 + u64 split) 392 + { 393 + if (tree->ops && tree->ops->split_extent_hook) 394 + return tree->ops->split_extent_hook(tree->mapping->host, 395 + orig, split); 404 396 return 0; 405 397 } 406 398 ··· 431 405 struct extent_state *prealloc, u64 split) 432 406 { 433 407 struct rb_node *node; 408 + 409 + split_cb(tree, orig, split); 410 + 434 411 prealloc->start = orig->start; 435 412 prealloc->end = split - 1; 436 413 prealloc->state = orig->state; ··· 571 542 if (err) 572 543 goto out; 573 544 if (state->end <= end) { 574 - set |= clear_state_bit(tree, state, bits, 575 - wake, delete); 545 + set |= clear_state_bit(tree, state, bits, wake, 546 + delete); 576 547 if (last_end == (u64)-1) 577 548 goto out; 578 549 start = last_end + 1; ··· 590 561 prealloc = alloc_extent_state(GFP_ATOMIC); 591 562 err = split_state(tree, state, prealloc, end + 1); 592 563 BUG_ON(err == -EEXIST); 593 - 594 564 if (wake) 595 565 wake_up(&state->wq); 596 566 597 - set |= clear_state_bit(tree, prealloc, bits, 598 - wake, delete); 567 + set |= clear_state_bit(tree, prealloc, bits, wake, delete); 568 + 599 569 prealloc = NULL; 600 570 goto out; 601 571 } ··· 695 667 return 0; 696 668 } 697 669 698 - static void set_state_bits(struct extent_io_tree *tree, 670 + static int set_state_bits(struct extent_io_tree *tree, 699 671 struct extent_state *state, 700 672 int bits) 701 673 { 674 + int ret; 675 + 676 + ret = set_state_cb(tree, state, bits); 677 + if (ret) 678 + return ret; 679 + 702 680 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 703 681 u64 range = state->end - state->start + 1; 704 682 tree->dirty_bytes += range; 705 683 } 706 - set_state_cb(tree, state, bits); 707 684 state->state |= bits; 685 + 686 + return 0; 708 687 } 709 688 710 689 static void cache_state(struct extent_state *state, ··· 793 758 goto out; 794 759 } 795 760 796 - set_state_bits(tree, state, bits); 761 + err = set_state_bits(tree, state, bits); 762 + if (err) 763 + goto out; 764 + 797 765 cache_state(state, cached_state); 798 766 merge_state(tree, state); 799 767 if (last_end == (u64)-1) ··· 843 805 if (err) 844 806 goto out; 845 807 if (state->end <= end) { 846 - set_state_bits(tree, state, bits); 808 + err = set_state_bits(tree, state, bits); 809 + if (err) 810 + goto out; 847 811 cache_state(state, cached_state); 848 812 merge_state(tree, state); 849 813 if (last_end == (u64)-1) ··· 869 829 this_end = last_start - 1; 870 830 err = insert_state(tree, prealloc, start, this_end, 871 831 bits); 832 + BUG_ON(err == -EEXIST); 833 + if (err) { 834 + prealloc = NULL; 835 + goto out; 836 + } 872 837 cache_state(prealloc, cached_state); 873 838 prealloc = NULL; 874 - BUG_ON(err == -EEXIST); 875 - if (err) 876 - goto out; 877 839 start = this_end + 1; 878 840 goto search_again; 879 841 } ··· 894 852 err = split_state(tree, state, prealloc, end + 1); 895 853 BUG_ON(err == -EEXIST); 896 854 897 - set_state_bits(tree, prealloc, bits); 855 + err = set_state_bits(tree, prealloc, bits); 856 + if (err) { 857 + prealloc = NULL; 858 + goto out; 859 + } 898 860 cache_state(prealloc, cached_state); 899 861 merge_state(tree, prealloc); 900 862 prealloc = NULL;
+11 -2
fs/btrfs/extent_io.h
··· 60 60 struct extent_state *state, int uptodate); 61 61 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 62 62 unsigned long old, unsigned long bits); 63 - int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, 64 - unsigned long old, unsigned long bits); 63 + int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 64 + unsigned long bits); 65 + int (*merge_extent_hook)(struct inode *inode, 66 + struct extent_state *new, 67 + struct extent_state *other); 68 + int (*split_extent_hook)(struct inode *inode, 69 + struct extent_state *orig, u64 split); 65 70 int (*write_cache_pages_lock_hook)(struct page *page); 66 71 }; 67 72 ··· 84 79 u64 start; 85 80 u64 end; /* inclusive */ 86 81 struct rb_node rb_node; 82 + 83 + /* ADD NEW ELEMENTS AFTER THIS */ 87 84 struct extent_io_tree *tree; 88 85 wait_queue_head_t wq; 89 86 atomic_t refs; 90 87 unsigned long state; 88 + u64 split_start; 89 + u64 split_end; 91 90 92 91 /* for use by the FS */ 93 92 u64 private;
+25 -8
fs/btrfs/file.c
··· 123 123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 124 124 125 125 end_of_last_block = start_pos + num_bytes - 1; 126 - btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); 126 + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); 127 + if (err) 128 + return err; 129 + 127 130 for (i = 0; i < num_pages; i++) { 128 131 struct page *p = pages[i]; 129 132 SetPageUptodate(p); ··· 920 917 start_pos = pos; 921 918 922 919 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 920 + 921 + /* do the reserve before the mutex lock in case we have to do some 922 + * flushing. We wouldn't deadlock, but this is more polite. 923 + */ 924 + err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); 925 + if (err) 926 + goto out_nolock; 927 + 928 + mutex_lock(&inode->i_mutex); 929 + 923 930 current->backing_dev_info = inode->i_mapping->backing_dev_info; 924 931 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 925 932 if (err) 926 - goto out_nolock; 933 + goto out; 934 + 927 935 if (count == 0) 928 - goto out_nolock; 936 + goto out; 929 937 930 938 err = file_remove_suid(file); 931 939 if (err) 932 - goto out_nolock; 940 + goto out; 941 + 933 942 file_update_time(file); 934 943 935 944 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 936 945 937 - mutex_lock(&inode->i_mutex); 946 + /* generic_write_checks can change our pos */ 947 + start_pos = pos; 948 + 938 949 BTRFS_I(inode)->sequence++; 939 950 first_index = pos >> PAGE_CACHE_SHIFT; 940 951 last_index = (pos + count) >> PAGE_CACHE_SHIFT; ··· 1022 1005 } 1023 1006 1024 1007 if (will_write) { 1025 - btrfs_fdatawrite_range(inode->i_mapping, pos, 1026 - pos + write_bytes - 1, 1027 - WB_SYNC_ALL); 1008 + filemap_fdatawrite_range(inode->i_mapping, pos, 1009 + pos + write_bytes - 1); 1028 1010 } else { 1029 1011 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1030 1012 num_pages); ··· 1044 1028 mutex_unlock(&inode->i_mutex); 1045 1029 if (ret) 1046 1030 err = ret; 1031 + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1047 1032 1048 1033 out_nolock: 1049 1034 kfree(pages);
+204 -31
fs/btrfs/inode.c
··· 1159 1159 return ret; 1160 1160 } 1161 1161 1162 + static int btrfs_split_extent_hook(struct inode *inode, 1163 + struct extent_state *orig, u64 split) 1164 + { 1165 + struct btrfs_root *root = BTRFS_I(inode)->root; 1166 + u64 size; 1167 + 1168 + if (!(orig->state & EXTENT_DELALLOC)) 1169 + return 0; 1170 + 1171 + size = orig->end - orig->start + 1; 1172 + if (size > root->fs_info->max_extent) { 1173 + u64 num_extents; 1174 + u64 new_size; 1175 + 1176 + new_size = orig->end - split + 1; 1177 + num_extents = div64_u64(size + root->fs_info->max_extent - 1, 1178 + root->fs_info->max_extent); 1179 + 1180 + /* 1181 + * if we break a large extent up then leave delalloc_extents be, 1182 + * since we've already accounted for the large extent. 1183 + */ 1184 + if (div64_u64(new_size + root->fs_info->max_extent - 1, 1185 + root->fs_info->max_extent) < num_extents) 1186 + return 0; 1187 + } 1188 + 1189 + BTRFS_I(inode)->delalloc_extents++; 1190 + 1191 + return 0; 1192 + } 1193 + 1194 + /* 1195 + * extent_io.c merge_extent_hook, used to track merged delayed allocation 1196 + * extents so we can keep track of new extents that are just merged onto old 1197 + * extents, such as when we are doing sequential writes, so we can properly 1198 + * account for the metadata space we'll need. 1199 + */ 1200 + static int btrfs_merge_extent_hook(struct inode *inode, 1201 + struct extent_state *new, 1202 + struct extent_state *other) 1203 + { 1204 + struct btrfs_root *root = BTRFS_I(inode)->root; 1205 + u64 new_size, old_size; 1206 + u64 num_extents; 1207 + 1208 + /* not delalloc, ignore it */ 1209 + if (!(other->state & EXTENT_DELALLOC)) 1210 + return 0; 1211 + 1212 + old_size = other->end - other->start + 1; 1213 + if (new->start < other->start) 1214 + new_size = other->end - new->start + 1; 1215 + else 1216 + new_size = new->end - other->start + 1; 1217 + 1218 + /* we're not bigger than the max, unreserve the space and go */ 1219 + if (new_size <= root->fs_info->max_extent) { 1220 + BTRFS_I(inode)->delalloc_extents--; 1221 + return 0; 1222 + } 1223 + 1224 + /* 1225 + * If we grew by another max_extent, just return, we want to keep that 1226 + * reserved amount. 1227 + */ 1228 + num_extents = div64_u64(old_size + root->fs_info->max_extent - 1, 1229 + root->fs_info->max_extent); 1230 + if (div64_u64(new_size + root->fs_info->max_extent - 1, 1231 + root->fs_info->max_extent) > num_extents) 1232 + return 0; 1233 + 1234 + BTRFS_I(inode)->delalloc_extents--; 1235 + 1236 + return 0; 1237 + } 1238 + 1162 1239 /* 1163 1240 * extent_io.c set_bit_hook, used to track delayed allocation 1164 1241 * bytes in this file, and to maintain the list of inodes that ··· 1244 1167 static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1245 1168 unsigned long old, unsigned long bits) 1246 1169 { 1170 + 1247 1171 /* 1248 1172 * set_bit and clear bit hooks normally require _irqsave/restore 1249 1173 * but in this case, we are only testeing for the DELALLOC ··· 1252 1174 */ 1253 1175 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1254 1176 struct btrfs_root *root = BTRFS_I(inode)->root; 1177 + 1178 + BTRFS_I(inode)->delalloc_extents++; 1255 1179 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1256 1180 spin_lock(&root->fs_info->delalloc_lock); 1257 1181 BTRFS_I(inode)->delalloc_bytes += end - start + 1; ··· 1270 1190 /* 1271 1191 * extent_io.c clear_bit_hook, see set_bit_hook for why 1272 1192 */ 1273 - static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, 1274 - unsigned long old, unsigned long bits) 1193 + static int btrfs_clear_bit_hook(struct inode *inode, 1194 + struct extent_state *state, unsigned long bits) 1275 1195 { 1276 1196 /* 1277 1197 * set_bit and clear bit hooks normally require _irqsave/restore 1278 1198 * but in this case, we are only testeing for the DELALLOC 1279 1199 * bit, which is only set or cleared with irqs on 1280 1200 */ 1281 - if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1201 + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1282 1202 struct btrfs_root *root = BTRFS_I(inode)->root; 1283 1203 1204 + BTRFS_I(inode)->delalloc_extents--; 1205 + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1206 + 1284 1207 spin_lock(&root->fs_info->delalloc_lock); 1285 - if (end - start + 1 > root->fs_info->delalloc_bytes) { 1208 + if (state->end - state->start + 1 > 1209 + root->fs_info->delalloc_bytes) { 1286 1210 printk(KERN_INFO "btrfs warning: delalloc account " 1287 1211 "%llu %llu\n", 1288 - (unsigned long long)end - start + 1, 1212 + (unsigned long long) 1213 + state->end - state->start + 1, 1289 1214 (unsigned long long) 1290 1215 root->fs_info->delalloc_bytes); 1291 1216 btrfs_delalloc_free_space(root, inode, (u64)-1); ··· 1298 1213 BTRFS_I(inode)->delalloc_bytes = 0; 1299 1214 } else { 1300 1215 btrfs_delalloc_free_space(root, inode, 1301 - end - start + 1); 1302 - root->fs_info->delalloc_bytes -= end - start + 1; 1303 - BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1216 + state->end - 1217 + state->start + 1); 1218 + root->fs_info->delalloc_bytes -= state->end - 1219 + state->start + 1; 1220 + BTRFS_I(inode)->delalloc_bytes -= state->end - 1221 + state->start + 1; 1304 1222 } 1305 1223 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1306 1224 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { ··· 3038 2950 goto again; 3039 2951 } 3040 2952 3041 - btrfs_set_extent_delalloc(inode, page_start, page_end); 2953 + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 2954 + if (ret) { 2955 + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 2956 + goto out_unlock; 2957 + } 2958 + 3042 2959 ret = 0; 3043 2960 if (offset != PAGE_CACHE_SIZE) { 3044 2961 kaddr = kmap(page); ··· 3074 2981 u64 last_byte; 3075 2982 u64 cur_offset; 3076 2983 u64 hole_size; 3077 - int err; 2984 + int err = 0; 3078 2985 3079 2986 if (size <= hole_start) 3080 2987 return 0; 3081 - 3082 - err = btrfs_check_metadata_free_space(root); 3083 - if (err) 3084 - return err; 3085 2988 3086 2989 btrfs_truncate_page(inode->i_mapping, inode->i_size); 3087 2990 ··· 3113 3024 cur_offset, &hint_byte, 1); 3114 3025 if (err) 3115 3026 break; 3027 + 3028 + err = btrfs_reserve_metadata_space(root, 1); 3029 + if (err) 3030 + break; 3031 + 3116 3032 err = btrfs_insert_file_extent(trans, root, 3117 3033 inode->i_ino, cur_offset, 0, 3118 3034 0, hole_size, 0, hole_size, 3119 3035 0, 0, 0); 3120 3036 btrfs_drop_extent_cache(inode, hole_start, 3121 3037 last_byte - 1, 0); 3038 + btrfs_unreserve_metadata_space(root, 1); 3122 3039 } 3123 3040 free_extent_map(em); 3124 3041 cur_offset = last_byte; ··· 4085 3990 if (!new_valid_dev(rdev)) 4086 3991 return -EINVAL; 4087 3992 4088 - err = btrfs_check_metadata_free_space(root); 3993 + /* 3994 + * 2 for inode item and ref 3995 + * 2 for dir items 3996 + * 1 for xattr if selinux is on 3997 + */ 3998 + err = btrfs_reserve_metadata_space(root, 5); 4089 3999 if (err) 4090 - goto fail; 4000 + return err; 4091 4001 4092 4002 trans = btrfs_start_transaction(root, 1); 4003 + if (!trans) 4004 + goto fail; 4093 4005 btrfs_set_trans_block_group(trans, dir); 4094 4006 4095 4007 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); ··· 4134 4032 nr = trans->blocks_used; 4135 4033 btrfs_end_transaction_throttle(trans, root); 4136 4034 fail: 4035 + btrfs_unreserve_metadata_space(root, 5); 4137 4036 if (drop_inode) { 4138 4037 inode_dec_link_count(inode); 4139 4038 iput(inode); ··· 4155 4052 u64 objectid; 4156 4053 u64 index = 0; 4157 4054 4158 - err = btrfs_check_metadata_free_space(root); 4055 + /* 4056 + * 2 for inode item and ref 4057 + * 2 for dir items 4058 + * 1 for xattr if selinux is on 4059 + */ 4060 + err = btrfs_reserve_metadata_space(root, 5); 4159 4061 if (err) 4160 - goto fail; 4062 + return err; 4063 + 4161 4064 trans = btrfs_start_transaction(root, 1); 4065 + if (!trans) 4066 + goto fail; 4162 4067 btrfs_set_trans_block_group(trans, dir); 4163 4068 4164 4069 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); ··· 4207 4096 nr = trans->blocks_used; 4208 4097 btrfs_end_transaction_throttle(trans, root); 4209 4098 fail: 4099 + btrfs_unreserve_metadata_space(root, 5); 4210 4100 if (drop_inode) { 4211 4101 inode_dec_link_count(inode); 4212 4102 iput(inode); ··· 4230 4118 if (inode->i_nlink == 0) 4231 4119 return -ENOENT; 4232 4120 4233 - btrfs_inc_nlink(inode); 4234 - err = btrfs_check_metadata_free_space(root); 4121 + /* 4122 + * 1 item for inode ref 4123 + * 2 items for dir items 4124 + */ 4125 + err = btrfs_reserve_metadata_space(root, 3); 4235 4126 if (err) 4236 - goto fail; 4127 + return err; 4128 + 4129 + btrfs_inc_nlink(inode); 4130 + 4237 4131 err = btrfs_set_inode_index(dir, &index); 4238 4132 if (err) 4239 4133 goto fail; ··· 4263 4145 nr = trans->blocks_used; 4264 4146 btrfs_end_transaction_throttle(trans, root); 4265 4147 fail: 4148 + btrfs_unreserve_metadata_space(root, 3); 4266 4149 if (drop_inode) { 4267 4150 inode_dec_link_count(inode); 4268 4151 iput(inode); ··· 4283 4164 u64 index = 0; 4284 4165 unsigned long nr = 1; 4285 4166 4286 - err = btrfs_check_metadata_free_space(root); 4167 + /* 4168 + * 2 items for inode and ref 4169 + * 2 items for dir items 4170 + * 1 for xattr if selinux is on 4171 + */ 4172 + err = btrfs_reserve_metadata_space(root, 5); 4287 4173 if (err) 4288 - goto out_unlock; 4174 + return err; 4289 4175 4290 4176 trans = btrfs_start_transaction(root, 1); 4291 - btrfs_set_trans_block_group(trans, dir); 4292 - 4293 - if (IS_ERR(trans)) { 4294 - err = PTR_ERR(trans); 4177 + if (!trans) { 4178 + err = -ENOMEM; 4295 4179 goto out_unlock; 4296 4180 } 4181 + btrfs_set_trans_block_group(trans, dir); 4297 4182 4298 4183 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4299 4184 if (err) { ··· 4346 4223 btrfs_end_transaction_throttle(trans, root); 4347 4224 4348 4225 out_unlock: 4226 + btrfs_unreserve_metadata_space(root, 5); 4349 4227 if (drop_on_err) 4350 4228 iput(inode); 4351 4229 btrfs_btree_balance_dirty(root, nr); ··· 4871 4747 goto out; 4872 4748 } 4873 4749 4750 + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); 4751 + if (ret) { 4752 + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 4753 + ret = VM_FAULT_SIGBUS; 4754 + goto out; 4755 + } 4756 + 4874 4757 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 4875 4758 again: 4876 4759 lock_page(page); ··· 4909 4778 goto again; 4910 4779 } 4911 4780 4912 - btrfs_set_extent_delalloc(inode, page_start, page_end); 4781 + /* 4782 + * XXX - page_mkwrite gets called every time the page is dirtied, even 4783 + * if it was already dirty, so for space accounting reasons we need to 4784 + * clear any delalloc bits for the range we are fixing to save. There 4785 + * is probably a better way to do this, but for now keep consistent with 4786 + * prepare_pages in the normal write path. 4787 + */ 4788 + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 4789 + EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS); 4790 + 4791 + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 4792 + if (ret) { 4793 + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4794 + ret = VM_FAULT_SIGBUS; 4795 + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 4796 + goto out_unlock; 4797 + } 4913 4798 ret = 0; 4914 4799 4915 4800 /* page is wholly or partially inside EOF */ ··· 4948 4801 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 4949 4802 4950 4803 out_unlock: 4804 + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 4951 4805 if (!ret) 4952 4806 return VM_FAULT_LOCKED; 4953 4807 unlock_page(page); ··· 5065 4917 return NULL; 5066 4918 ei->last_trans = 0; 5067 4919 ei->logged_trans = 0; 4920 + ei->delalloc_extents = 0; 4921 + ei->delalloc_reserved_extents = 0; 5068 4922 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 5069 4923 INIT_LIST_HEAD(&ei->i_orphan); 5070 4924 INIT_LIST_HEAD(&ei->ordered_operations); ··· 5220 5070 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 5221 5071 return -ENOTEMPTY; 5222 5072 5223 - ret = btrfs_check_metadata_free_space(root); 5073 + /* 5074 + * 2 items for dir items 5075 + * 1 item for orphan entry 5076 + * 1 item for ref 5077 + */ 5078 + ret = btrfs_reserve_metadata_space(root, 4); 5224 5079 if (ret) 5225 5080 return ret; 5226 5081 ··· 5340 5185 5341 5186 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5342 5187 up_read(&root->fs_info->subvol_sem); 5188 + 5189 + btrfs_unreserve_metadata_space(root, 4); 5343 5190 return ret; 5344 5191 } 5345 5192 ··· 5413 5256 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 5414 5257 return -ENAMETOOLONG; 5415 5258 5416 - err = btrfs_check_metadata_free_space(root); 5259 + /* 5260 + * 2 items for inode item and ref 5261 + * 2 items for dir items 5262 + * 1 item for xattr if selinux is on 5263 + */ 5264 + err = btrfs_reserve_metadata_space(root, 5); 5417 5265 if (err) 5418 - goto out_fail; 5266 + return err; 5419 5267 5420 5268 trans = btrfs_start_transaction(root, 1); 5269 + if (!trans) 5270 + goto out_fail; 5421 5271 btrfs_set_trans_block_group(trans, dir); 5422 5272 5423 5273 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); ··· 5505 5341 nr = trans->blocks_used; 5506 5342 btrfs_end_transaction_throttle(trans, root); 5507 5343 out_fail: 5344 + btrfs_unreserve_metadata_space(root, 5); 5508 5345 if (drop_inode) { 5509 5346 inode_dec_link_count(inode); 5510 5347 iput(inode); ··· 5527 5362 5528 5363 while (num_bytes > 0) { 5529 5364 alloc_size = min(num_bytes, root->fs_info->max_extent); 5365 + 5366 + ret = btrfs_reserve_metadata_space(root, 1); 5367 + if (ret) 5368 + goto out; 5369 + 5530 5370 ret = btrfs_reserve_extent(trans, root, alloc_size, 5531 5371 root->sectorsize, 0, alloc_hint, 5532 5372 (u64)-1, &ins, 1); ··· 5551 5381 num_bytes -= ins.offset; 5552 5382 cur_offset += ins.offset; 5553 5383 alloc_hint = ins.objectid + ins.offset; 5384 + btrfs_unreserve_metadata_space(root, 1); 5554 5385 } 5555 5386 out: 5556 5387 if (cur_offset > start) { ··· 5737 5566 .readpage_io_failed_hook = btrfs_io_failed_hook, 5738 5567 .set_bit_hook = btrfs_set_bit_hook, 5739 5568 .clear_bit_hook = btrfs_clear_bit_hook, 5569 + .merge_extent_hook = btrfs_merge_extent_hook, 5570 + .split_extent_hook = btrfs_split_extent_hook, 5740 5571 }; 5741 5572 5742 5573 /*
+42 -22
fs/btrfs/ioctl.c
··· 239 239 u64 index = 0; 240 240 unsigned long nr = 1; 241 241 242 - ret = btrfs_check_metadata_free_space(root); 242 + /* 243 + * 1 - inode item 244 + * 2 - refs 245 + * 1 - root item 246 + * 2 - dir items 247 + */ 248 + ret = btrfs_reserve_metadata_space(root, 6); 243 249 if (ret) 244 250 return ret; 245 251 ··· 346 340 err = btrfs_commit_transaction(trans, root); 347 341 if (err && !ret) 348 342 ret = err; 343 + 344 + btrfs_unreserve_metadata_space(root, 6); 345 + btrfs_btree_balance_dirty(root, nr); 349 346 return ret; 350 347 } 351 348 ··· 364 355 if (!root->ref_cows) 365 356 return -EINVAL; 366 357 367 - ret = btrfs_check_metadata_free_space(root); 358 + /* 359 + * 1 - inode item 360 + * 2 - refs 361 + * 1 - root item 362 + * 2 - dir items 363 + */ 364 + ret = btrfs_reserve_metadata_space(root, 6); 368 365 if (ret) 369 366 goto fail_unlock; 370 367 371 368 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 372 369 if (!pending_snapshot) { 373 370 ret = -ENOMEM; 371 + btrfs_unreserve_metadata_space(root, 6); 374 372 goto fail_unlock; 375 373 } 376 374 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); 377 375 if (!pending_snapshot->name) { 378 376 ret = -ENOMEM; 379 377 kfree(pending_snapshot); 378 + btrfs_unreserve_metadata_space(root, 6); 380 379 goto fail_unlock; 381 380 } 382 381 memcpy(pending_snapshot->name, name, namelen); ··· 1232 1215 struct inode *inode = fdentry(file)->d_inode; 1233 1216 struct btrfs_root *root = BTRFS_I(inode)->root; 1234 1217 struct btrfs_trans_handle *trans; 1235 - int ret = 0; 1218 + int ret; 1236 1219 1220 + ret = -EPERM; 1237 1221 if (!capable(CAP_SYS_ADMIN)) 1238 - return -EPERM; 1239 - 1240 - if (file->private_data) { 1241 - ret = -EINPROGRESS; 1242 1222 goto out; 1243 - } 1223 + 1224 + ret = -EINPROGRESS; 1225 + if (file->private_data) 1226 + goto out; 1244 1227 1245 1228 ret = mnt_want_write(file->f_path.mnt); 1246 1229 if (ret) ··· 1250 1233 root->fs_info->open_ioctl_trans++; 1251 1234 mutex_unlock(&root->fs_info->trans_mutex); 1252 1235 1236 + ret = -ENOMEM; 1253 1237 trans = btrfs_start_ioctl_transaction(root, 0); 1254 - if (trans) 1255 - file->private_data = trans; 1256 - else 1257 - ret = -ENOMEM; 1258 - /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ 1238 + if (!trans) 1239 + goto out_drop; 1240 + 1241 + file->private_data = trans; 1242 + return 0; 1243 + 1244 + out_drop: 1245 + mutex_lock(&root->fs_info->trans_mutex); 1246 + root->fs_info->open_ioctl_trans--; 1247 + mutex_unlock(&root->fs_info->trans_mutex); 1248 + mnt_drop_write(file->f_path.mnt); 1259 1249 out: 1260 1250 return ret; 1261 1251 } ··· 1278 1254 struct inode *inode = fdentry(file)->d_inode; 1279 1255 struct btrfs_root *root = BTRFS_I(inode)->root; 1280 1256 struct btrfs_trans_handle *trans; 1281 - int ret = 0; 1282 1257 1283 1258 trans = file->private_data; 1284 - if (!trans) { 1285 - ret = -EINVAL; 1286 - goto out; 1287 - } 1288 - btrfs_end_transaction(trans, root); 1259 + if (!trans) 1260 + return -EINVAL; 1289 1261 file->private_data = NULL; 1262 + 1263 + btrfs_end_transaction(trans, root); 1290 1264 1291 1265 mutex_lock(&root->fs_info->trans_mutex); 1292 1266 root->fs_info->open_ioctl_trans--; 1293 1267 mutex_unlock(&root->fs_info->trans_mutex); 1294 1268 1295 1269 mnt_drop_write(file->f_path.mnt); 1296 - 1297 - out: 1298 - return ret; 1270 + return 0; 1299 1271 } 1300 1272 1301 1273 long btrfs_ioctl(struct file *file, unsigned int
+4 -89
fs/btrfs/ordered-data.c
··· 458 458 * start IO on any dirty ones so the wait doesn't stall waiting 459 459 * for pdflush to find them 460 460 */ 461 - btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); 461 + filemap_fdatawrite_range(inode->i_mapping, start, end); 462 462 if (wait) { 463 463 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 464 464 &entry->flags)); ··· 488 488 /* start IO across the range first to instantiate any delalloc 489 489 * extents 490 490 */ 491 - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 491 + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); 492 492 493 493 /* The compression code will leave pages locked but return from 494 494 * writepage without setting the page writeback. Starting again 495 495 * with WB_SYNC_ALL will end up waiting for the IO to actually start. 496 496 */ 497 - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 497 + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); 498 498 499 - btrfs_wait_on_page_writeback_range(inode->i_mapping, 500 - start >> PAGE_CACHE_SHIFT, 501 - orig_end >> PAGE_CACHE_SHIFT); 499 + filemap_fdatawait_range(inode->i_mapping, start, orig_end); 502 500 503 501 end = orig_end; 504 502 found = 0; ··· 713 715 return ret; 714 716 } 715 717 716 - 717 - /** 718 - * taken from mm/filemap.c because it isn't exported 719 - * 720 - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range 721 - * @mapping: address space structure to write 722 - * @start: offset in bytes where the range starts 723 - * @end: offset in bytes where the range ends (inclusive) 724 - * @sync_mode: enable synchronous operation 725 - * 726 - * Start writeback against all of a mapping's dirty pages that lie 727 - * within the byte offsets <start, end> inclusive. 728 - * 729 - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 730 - * opposed to a regular memory cleansing writeback. The difference between 731 - * these two operations is that if a dirty page/buffer is encountered, it must 732 - * be waited upon, and not just skipped over. 733 - */ 734 - int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, 735 - loff_t end, int sync_mode) 736 - { 737 - struct writeback_control wbc = { 738 - .sync_mode = sync_mode, 739 - .nr_to_write = mapping->nrpages * 2, 740 - .range_start = start, 741 - .range_end = end, 742 - }; 743 - return btrfs_writepages(mapping, &wbc); 744 - } 745 - 746 - /** 747 - * taken from mm/filemap.c because it isn't exported 748 - * 749 - * wait_on_page_writeback_range - wait for writeback to complete 750 - * @mapping: target address_space 751 - * @start: beginning page index 752 - * @end: ending page index 753 - * 754 - * Wait for writeback to complete against pages indexed by start->end 755 - * inclusive 756 - */ 757 - int btrfs_wait_on_page_writeback_range(struct address_space *mapping, 758 - pgoff_t start, pgoff_t end) 759 - { 760 - struct pagevec pvec; 761 - int nr_pages; 762 - int ret = 0; 763 - pgoff_t index; 764 - 765 - if (end < start) 766 - return 0; 767 - 768 - pagevec_init(&pvec, 0); 769 - index = start; 770 - while ((index <= end) && 771 - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 772 - PAGECACHE_TAG_WRITEBACK, 773 - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { 774 - unsigned i; 775 - 776 - for (i = 0; i < nr_pages; i++) { 777 - struct page *page = pvec.pages[i]; 778 - 779 - /* until radix tree lookup accepts end_index */ 780 - if (page->index > end) 781 - continue; 782 - 783 - wait_on_page_writeback(page); 784 - if (PageError(page)) 785 - ret = -EIO; 786 - } 787 - pagevec_release(&pvec); 788 - cond_resched(); 789 - } 790 - 791 - /* Check for outstanding write errors */ 792 - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 793 - ret = -ENOSPC; 794 - if (test_and_clear_bit(AS_EIO, &mapping->flags)) 795 - ret = -EIO; 796 - 797 - return ret; 798 - } 799 718 800 719 /* 801 720 * add a given inode to the list of inodes that must be fully on
-4
fs/btrfs/ordered-data.h
··· 153 153 int btrfs_ordered_update_i_size(struct inode *inode, 154 154 struct btrfs_ordered_extent *ordered); 155 155 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 156 - int btrfs_wait_on_page_writeback_range(struct address_space *mapping, 157 - pgoff_t start, pgoff_t end); 158 - int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, 159 - loff_t end, int sync_mode); 160 156 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 161 157 int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 162 158 int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+2
fs/btrfs/super.c
··· 344 344 sb->s_export_op = &btrfs_export_ops; 345 345 sb->s_xattr = btrfs_xattr_handlers; 346 346 sb->s_time_gran = 1; 347 + #ifdef CONFIG_BTRFS_POSIX_ACL 347 348 sb->s_flags |= MS_POSIXACL; 349 + #endif 348 350 349 351 tree_root = open_ctree(sb, fs_devices, (char *)data); 350 352
+10
fs/btrfs/transaction.c
··· 186 186 h->alloc_exclude_start = 0; 187 187 h->delayed_ref_updates = 0; 188 188 189 + if (!current->journal_info) 190 + current->journal_info = h; 191 + 189 192 root->fs_info->running_transaction->use_count++; 190 193 record_root_in_trans(h, root); 191 194 mutex_unlock(&root->fs_info->trans_mutex); ··· 320 317 wake_up(&cur_trans->writer_wait); 321 318 put_transaction(cur_trans); 322 319 mutex_unlock(&info->trans_mutex); 320 + 321 + if (current->journal_info == trans) 322 + current->journal_info = NULL; 323 323 memset(trans, 0, sizeof(*trans)); 324 324 kmem_cache_free(btrfs_trans_handle_cachep, trans); 325 325 ··· 749 743 memcpy(&pending->root_key, &key, sizeof(key)); 750 744 fail: 751 745 kfree(new_root_item); 746 + btrfs_unreserve_metadata_space(root, 6); 752 747 return ret; 753 748 } 754 749 ··· 1065 1058 put_transaction(cur_trans); 1066 1059 1067 1060 mutex_unlock(&root->fs_info->trans_mutex); 1061 + 1062 + if (current->journal_info == trans) 1063 + current->journal_info = NULL; 1068 1064 1069 1065 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1070 1066 return ret;
+3 -1
fs/btrfs/volumes.c
··· 446 446 goto error; 447 447 448 448 device->name = kstrdup(orig_dev->name, GFP_NOFS); 449 - if (!device->name) 449 + if (!device->name) { 450 + kfree(device); 450 451 goto error; 452 + } 451 453 452 454 device->devid = orig_dev->devid; 453 455 device->work.func = pending_bios_fn;
+1 -1
fs/btrfs/xattr.c
··· 260 260 * attributes are handled directly. 261 261 */ 262 262 struct xattr_handler *btrfs_xattr_handlers[] = { 263 - #ifdef CONFIG_FS_POSIX_ACL 263 + #ifdef CONFIG_BTRFS_POSIX_ACL 264 264 &btrfs_xattr_acl_access_handler, 265 265 &btrfs_xattr_acl_default_handler, 266 266 #endif