Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 fixes from Ted Ts'o:
"Regression and bug fixes:

- Performance regression fix from 5.18 on a Raspberry Pi

- Fix extent parsing bug which triggers a BUG_ON when a (corrupted)
extent tree has a non-root node with zero entries.

- Fix a livelock where in the right (wrong) circumstances a large
number of nfsd threads can try to write to a nearly full file
system, and retry for hours(!)"

* tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
ext4: limit the number of retries after discarding preallocations blocks
ext4: fix bug in extents parsing when eh_entries == 0 and eh_depth > 0
ext4: use buckets for cr 1 block scan instead of rbtree
ext4: use locality group preallocation for small closed files
ext4: make directory inode spreading reflect flexbg size
ext4: avoid unnecessary spreading of allocations among groups
ext4: make mballoc try target group first even with mb_optimize_scan

+154 -181
+5 -5
fs/ext4/ext4.h
··· 167 167 #define EXT4_MB_CR0_OPTIMIZED 0x8000 168 168 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ 169 169 #define EXT4_MB_CR1_OPTIMIZED 0x00010000 170 - /* Perform linear traversal for one group */ 171 - #define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 172 170 struct ext4_allocation_request { 173 171 /* target inode for block we're allocating */ 174 172 struct inode *inode; ··· 1598 1600 struct list_head s_discard_list; 1599 1601 struct work_struct s_discard_work; 1600 1602 atomic_t s_retry_alloc_pending; 1601 - struct rb_root s_mb_avg_fragment_size_root; 1602 - rwlock_t s_mb_rb_lock; 1603 + struct list_head *s_mb_avg_fragment_size; 1604 + rwlock_t *s_mb_avg_fragment_size_locks; 1603 1605 struct list_head *s_mb_largest_free_orders; 1604 1606 rwlock_t *s_mb_largest_free_orders_locks; 1605 1607 ··· 3411 3413 ext4_grpblk_t bb_first_free; /* first free block */ 3412 3414 ext4_grpblk_t bb_free; /* total free blocks */ 3413 3415 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 3416 + int bb_avg_fragment_size_order; /* order of average 3417 + fragment in BG */ 3414 3418 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ 3415 3419 ext4_group_t bb_group; /* Group number */ 3416 3420 struct list_head bb_prealloc_list; ··· 3420 3420 void *bb_bitmap; 3421 3421 #endif 3422 3422 struct rw_semaphore alloc_sem; 3423 - struct rb_node bb_avg_fragment_size_rb; 3423 + struct list_head bb_avg_fragment_size_node; 3424 3424 struct list_head bb_largest_free_order_node; 3425 3425 ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block 3426 3426 * regions, index is order.
+4
fs/ext4/extents.c
··· 460 460 error_msg = "invalid eh_entries"; 461 461 goto corrupted; 462 462 } 463 + if (unlikely((eh->eh_entries == 0) && (depth > 0))) { 464 + error_msg = "eh_entries is 0 but eh_depth is > 0"; 465 + goto corrupted; 466 + } 463 467 if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { 464 468 error_msg = "invalid extent entries"; 465 469 goto corrupted;
+1 -1
fs/ext4/ialloc.c
··· 510 510 goto fallback; 511 511 } 512 512 513 - max_dirs = ndirs / ngroups + inodes_per_group / 16; 513 + max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; 514 514 min_inodes = avefreei - inodes_per_group*flex_size / 4; 515 515 if (min_inodes < 1) 516 516 min_inodes = 1;
+144 -174
fs/ext4/mballoc.c
··· 140 140 * number of buddy bitmap orders possible) number of lists. Group-infos are 141 141 * placed in appropriate lists. 142 142 * 143 - * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) 143 + * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) 144 144 * 145 - * Locking: sbi->s_mb_rb_lock (rwlock) 145 + * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) 146 146 * 147 - * This is a red black tree consisting of group infos and the tree is sorted 148 - * by average fragment sizes (which is calculated as ext4_group_info->bb_free 149 - * / ext4_group_info->bb_fragments). 147 + * This is an array of lists where in the i-th list there are groups with 148 + * average fragment size >= 2^i and < 2^(i+1). The average fragment size 149 + * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. 150 + * Note that we don't bother with a special list for completely empty groups 151 + * so we only have MB_NUM_ORDERS(sb) lists. 150 152 * 151 153 * When "mb_optimize_scan" mount option is set, mballoc consults the above data 152 154 * structures to decide the order in which groups are to be traversed for ··· 162 160 * 163 161 * At CR = 1, we only consider groups where average fragment size > request 164 162 * size. So, we lookup a group which has average fragment size just above or 165 - * equal to request size using our rb tree (data structure 2) in O(log N) time. 163 + * equal to request size using our average fragment size group lists (data 164 + * structure 2) in O(1) time. 166 165 * 167 166 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in 168 167 * linear order which requires O(N) search time for each CR 0 and CR 1 phase. 
··· 805 802 } 806 803 } 807 804 808 - static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, 809 - int (*cmp)(struct rb_node *, struct rb_node *)) 805 + static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) 810 806 { 811 - struct rb_node **iter = &root->rb_node, *parent = NULL; 807 + int order; 812 808 813 - while (*iter) { 814 - parent = *iter; 815 - if (cmp(new, *iter) > 0) 816 - iter = &((*iter)->rb_left); 817 - else 818 - iter = &((*iter)->rb_right); 819 - } 820 - 821 - rb_link_node(new, parent, iter); 822 - rb_insert_color(new, root); 809 + /* 810 + * We don't bother with a special lists groups with only 1 block free 811 + * extents and for completely empty groups. 812 + */ 813 + order = fls(len) - 2; 814 + if (order < 0) 815 + return 0; 816 + if (order == MB_NUM_ORDERS(sb)) 817 + order--; 818 + return order; 823 819 } 824 820 825 - static int 826 - ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) 827 - { 828 - struct ext4_group_info *grp1 = rb_entry(rb1, 829 - struct ext4_group_info, 830 - bb_avg_fragment_size_rb); 831 - struct ext4_group_info *grp2 = rb_entry(rb2, 832 - struct ext4_group_info, 833 - bb_avg_fragment_size_rb); 834 - int num_frags_1, num_frags_2; 835 - 836 - num_frags_1 = grp1->bb_fragments ? 837 - grp1->bb_free / grp1->bb_fragments : 0; 838 - num_frags_2 = grp2->bb_fragments ? 839 - grp2->bb_free / grp2->bb_fragments : 0; 840 - 841 - return (num_frags_2 - num_frags_1); 842 - } 843 - 844 - /* 845 - * Reinsert grpinfo into the avg_fragment_size tree with new average 846 - * fragment size. 
847 - */ 821 + /* Move group to appropriate avg_fragment_size list */ 848 822 static void 849 823 mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) 850 824 { 851 825 struct ext4_sb_info *sbi = EXT4_SB(sb); 826 + int new_order; 852 827 853 828 if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) 854 829 return; 855 830 856 - write_lock(&sbi->s_mb_rb_lock); 857 - if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { 858 - rb_erase(&grp->bb_avg_fragment_size_rb, 859 - &sbi->s_mb_avg_fragment_size_root); 860 - RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); 861 - } 831 + new_order = mb_avg_fragment_size_order(sb, 832 + grp->bb_free / grp->bb_fragments); 833 + if (new_order == grp->bb_avg_fragment_size_order) 834 + return; 862 835 863 - ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, 864 - &grp->bb_avg_fragment_size_rb, 865 - ext4_mb_avg_fragment_size_cmp); 866 - write_unlock(&sbi->s_mb_rb_lock); 836 + if (grp->bb_avg_fragment_size_order != -1) { 837 + write_lock(&sbi->s_mb_avg_fragment_size_locks[ 838 + grp->bb_avg_fragment_size_order]); 839 + list_del(&grp->bb_avg_fragment_size_node); 840 + write_unlock(&sbi->s_mb_avg_fragment_size_locks[ 841 + grp->bb_avg_fragment_size_order]); 842 + } 843 + grp->bb_avg_fragment_size_order = new_order; 844 + write_lock(&sbi->s_mb_avg_fragment_size_locks[ 845 + grp->bb_avg_fragment_size_order]); 846 + list_add_tail(&grp->bb_avg_fragment_size_node, 847 + &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); 848 + write_unlock(&sbi->s_mb_avg_fragment_size_locks[ 849 + grp->bb_avg_fragment_size_order]); 867 850 } 868 851 869 852 /* ··· 898 909 *new_cr = 1; 899 910 } else { 900 911 *group = grp->bb_group; 901 - ac->ac_last_optimal_group = *group; 902 912 ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; 903 913 } 904 914 } 905 915 906 916 /* 907 - * Choose next group by traversing average fragment size tree. Updates *new_cr 908 - * if cr lvel needs an update. 
Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that 909 - * the linear search should continue for one iteration since there's lock 910 - * contention on the rb tree lock. 917 + * Choose next group by traversing average fragment size list of suitable 918 + * order. Updates *new_cr if cr level needs an update. 911 919 */ 912 920 static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, 913 921 int *new_cr, ext4_group_t *group, ext4_group_t ngroups) 914 922 { 915 923 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 916 - int avg_fragment_size, best_so_far; 917 - struct rb_node *node, *found; 918 - struct ext4_group_info *grp; 919 - 920 - /* 921 - * If there is contention on the lock, instead of waiting for the lock 922 - * to become available, just continue searching lineraly. We'll resume 923 - * our rb tree search later starting at ac->ac_last_optimal_group. 924 - */ 925 - if (!read_trylock(&sbi->s_mb_rb_lock)) { 926 - ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; 927 - return; 928 - } 924 + struct ext4_group_info *grp, *iter; 925 + int i; 929 926 930 927 if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { 931 928 if (sbi->s_mb_stats) 932 929 atomic_inc(&sbi->s_bal_cr1_bad_suggestions); 933 - /* We have found something at CR 1 in the past */ 934 - grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); 935 - for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; 936 - found = rb_next(found)) { 937 - grp = rb_entry(found, struct ext4_group_info, 938 - bb_avg_fragment_size_rb); 930 + } 931 + 932 + for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); 933 + i < MB_NUM_ORDERS(ac->ac_sb); i++) { 934 + if (list_empty(&sbi->s_mb_avg_fragment_size[i])) 935 + continue; 936 + read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); 937 + if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { 938 + read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); 939 + continue; 940 + } 941 + grp = NULL; 942 + list_for_each_entry(iter, 
&sbi->s_mb_avg_fragment_size[i], 943 + bb_avg_fragment_size_node) { 939 944 if (sbi->s_mb_stats) 940 945 atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); 941 - if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) 946 + if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { 947 + grp = iter; 942 948 break; 943 - } 944 - goto done; 945 - } 946 - 947 - node = sbi->s_mb_avg_fragment_size_root.rb_node; 948 - best_so_far = 0; 949 - found = NULL; 950 - 951 - while (node) { 952 - grp = rb_entry(node, struct ext4_group_info, 953 - bb_avg_fragment_size_rb); 954 - avg_fragment_size = 0; 955 - if (ext4_mb_good_group(ac, grp->bb_group, 1)) { 956 - avg_fragment_size = grp->bb_fragments ? 957 - grp->bb_free / grp->bb_fragments : 0; 958 - if (!best_so_far || avg_fragment_size < best_so_far) { 959 - best_so_far = avg_fragment_size; 960 - found = node; 961 949 } 962 950 } 963 - if (avg_fragment_size > ac->ac_g_ex.fe_len) 964 - node = node->rb_right; 965 - else 966 - node = node->rb_left; 951 + read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); 952 + if (grp) 953 + break; 967 954 } 968 955 969 - done: 970 - if (found) { 971 - grp = rb_entry(found, struct ext4_group_info, 972 - bb_avg_fragment_size_rb); 956 + if (grp) { 973 957 *group = grp->bb_group; 974 958 ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; 975 959 } else { 976 960 *new_cr = 2; 977 961 } 978 - 979 - read_unlock(&sbi->s_mb_rb_lock); 980 - ac->ac_last_optimal_group = *group; 981 962 } 982 963 983 964 static inline int should_optimize_scan(struct ext4_allocation_context *ac) ··· 973 1014 974 1015 if (ac->ac_groups_linear_remaining) { 975 1016 ac->ac_groups_linear_remaining--; 976 - goto inc_and_return; 977 - } 978 - 979 - if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { 980 - ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; 981 1017 goto inc_and_return; 982 1018 } 983 1019 ··· 1003 1049 { 1004 1050 *new_cr = ac->ac_criteria; 1005 1051 1006 - if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) 1052 + if 
(!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { 1053 + *group = next_linear_group(ac, *group, ngroups); 1007 1054 return; 1055 + } 1008 1056 1009 1057 if (*new_cr == 0) { 1010 1058 ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); ··· 1031 1075 struct ext4_sb_info *sbi = EXT4_SB(sb); 1032 1076 int i; 1033 1077 1034 - if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { 1078 + for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) 1079 + if (grp->bb_counters[i] > 0) 1080 + break; 1081 + /* No need to move between order lists? */ 1082 + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || 1083 + i == grp->bb_largest_free_order) { 1084 + grp->bb_largest_free_order = i; 1085 + return; 1086 + } 1087 + 1088 + if (grp->bb_largest_free_order >= 0) { 1035 1089 write_lock(&sbi->s_mb_largest_free_orders_locks[ 1036 1090 grp->bb_largest_free_order]); 1037 1091 list_del_init(&grp->bb_largest_free_order_node); 1038 1092 write_unlock(&sbi->s_mb_largest_free_orders_locks[ 1039 1093 grp->bb_largest_free_order]); 1040 1094 } 1041 - grp->bb_largest_free_order = -1; /* uninit */ 1042 - 1043 - for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { 1044 - if (grp->bb_counters[i] > 0) { 1045 - grp->bb_largest_free_order = i; 1046 - break; 1047 - } 1048 - } 1049 - if (test_opt2(sb, MB_OPTIMIZE_SCAN) && 1050 - grp->bb_largest_free_order >= 0 && grp->bb_free) { 1095 + grp->bb_largest_free_order = i; 1096 + if (grp->bb_largest_free_order >= 0 && grp->bb_free) { 1051 1097 write_lock(&sbi->s_mb_largest_free_orders_locks[ 1052 1098 grp->bb_largest_free_order]); 1053 1099 list_add_tail(&grp->bb_largest_free_order_node, ··· 1106 1148 EXT4_GROUP_INFO_BBITMAP_CORRUPT); 1107 1149 } 1108 1150 mb_set_largest_free_order(sb, grp); 1151 + mb_update_avg_fragment_size(sb, grp); 1109 1152 1110 1153 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 1111 1154 1112 1155 period = get_cycles() - period; 1113 1156 atomic_inc(&sbi->s_mb_buddies_generated); 1114 1157 atomic64_add(period, 
&sbi->s_mb_generation_time); 1115 - mb_update_avg_fragment_size(sb, grp); 1116 1158 } 1117 1159 1118 1160 /* The buddy information is attached the buddy cache inode ··· 2594 2636 ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 2595 2637 { 2596 2638 ext4_group_t prefetch_grp = 0, ngroups, group, i; 2597 - int cr = -1; 2639 + int cr = -1, new_cr; 2598 2640 int err = 0, first_err = 0; 2599 2641 unsigned int nr = 0, prefetch_ios = 0; 2600 2642 struct ext4_sb_info *sbi; ··· 2665 2707 * from the goal value specified 2666 2708 */ 2667 2709 group = ac->ac_g_ex.fe_group; 2668 - ac->ac_last_optimal_group = group; 2669 2710 ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; 2670 2711 prefetch_grp = group; 2671 2712 2672 - for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), 2673 - i++) { 2674 - int ret = 0, new_cr; 2713 + for (i = 0, new_cr = cr; i < ngroups; i++, 2714 + ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { 2715 + int ret = 0; 2675 2716 2676 2717 cond_resched(); 2677 - 2678 - ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); 2679 2718 if (new_cr != cr) { 2680 2719 cr = new_cr; 2681 2720 goto repeat; ··· 2946 2991 struct super_block *sb = pde_data(file_inode(seq->file)); 2947 2992 unsigned long position; 2948 2993 2949 - read_lock(&EXT4_SB(sb)->s_mb_rb_lock); 2950 - 2951 - if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) 2994 + if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) 2952 2995 return NULL; 2953 2996 position = *pos + 1; 2954 2997 return (void *) ((unsigned long) position); ··· 2958 3005 unsigned long position; 2959 3006 2960 3007 ++*pos; 2961 - if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) 3008 + if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) 2962 3009 return NULL; 2963 3010 position = *pos + 1; 2964 3011 return (void *) ((unsigned long) position); ··· 2970 3017 struct ext4_sb_info *sbi = EXT4_SB(sb); 2971 3018 unsigned long position = ((unsigned long) v); 2972 3019 struct ext4_group_info *grp; 2973 - 
struct rb_node *n; 2974 - unsigned int count, min, max; 3020 + unsigned int count; 2975 3021 2976 3022 position--; 2977 3023 if (position >= MB_NUM_ORDERS(sb)) { 2978 - seq_puts(seq, "fragment_size_tree:\n"); 2979 - n = rb_first(&sbi->s_mb_avg_fragment_size_root); 2980 - if (!n) { 2981 - seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); 2982 - return 0; 2983 - } 2984 - grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); 2985 - min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; 2986 - count = 1; 2987 - while (rb_next(n)) { 2988 - count++; 2989 - n = rb_next(n); 2990 - } 2991 - grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); 2992 - max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; 3024 + position -= MB_NUM_ORDERS(sb); 3025 + if (position == 0) 3026 + seq_puts(seq, "avg_fragment_size_lists:\n"); 2993 3027 2994 - seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", 2995 - min, max, count); 3028 + count = 0; 3029 + read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); 3030 + list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], 3031 + bb_avg_fragment_size_node) 3032 + count++; 3033 + read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); 3034 + seq_printf(seq, "\tlist_order_%u_groups: %u\n", 3035 + (unsigned int)position, count); 2996 3036 return 0; 2997 3037 } 2998 3038 ··· 2995 3049 seq_puts(seq, "max_free_order_lists:\n"); 2996 3050 } 2997 3051 count = 0; 3052 + read_lock(&sbi->s_mb_largest_free_orders_locks[position]); 2998 3053 list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], 2999 3054 bb_largest_free_order_node) 3000 3055 count++; 3056 + read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); 3001 3057 seq_printf(seq, "\tlist_order_%u_groups: %u\n", 3002 3058 (unsigned int)position, count); 3003 3059 ··· 3007 3059 } 3008 3060 3009 3061 static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) 3010 - 
__releases(&EXT4_SB(sb)->s_mb_rb_lock) 3011 3062 { 3012 - struct super_block *sb = pde_data(file_inode(seq->file)); 3013 - 3014 - read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); 3015 3063 } 3016 3064 3017 3065 const struct seq_operations ext4_mb_seq_structs_summary_ops = { ··· 3120 3176 init_rwsem(&meta_group_info[i]->alloc_sem); 3121 3177 meta_group_info[i]->bb_free_root = RB_ROOT; 3122 3178 INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); 3123 - RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); 3179 + INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); 3124 3180 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ 3181 + meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ 3125 3182 meta_group_info[i]->bb_group = group; 3126 3183 3127 3184 mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); ··· 3371 3426 i++; 3372 3427 } while (i < MB_NUM_ORDERS(sb)); 3373 3428 3374 - sbi->s_mb_avg_fragment_size_root = RB_ROOT; 3429 + sbi->s_mb_avg_fragment_size = 3430 + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 3431 + GFP_KERNEL); 3432 + if (!sbi->s_mb_avg_fragment_size) { 3433 + ret = -ENOMEM; 3434 + goto out; 3435 + } 3436 + sbi->s_mb_avg_fragment_size_locks = 3437 + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), 3438 + GFP_KERNEL); 3439 + if (!sbi->s_mb_avg_fragment_size_locks) { 3440 + ret = -ENOMEM; 3441 + goto out; 3442 + } 3443 + for (i = 0; i < MB_NUM_ORDERS(sb); i++) { 3444 + INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); 3445 + rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); 3446 + } 3375 3447 sbi->s_mb_largest_free_orders = 3376 3448 kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 3377 3449 GFP_KERNEL); ··· 3407 3445 INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); 3408 3446 rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); 3409 3447 } 3410 - rwlock_init(&sbi->s_mb_rb_lock); 3411 3448 3412 3449 spin_lock_init(&sbi->s_md_lock); 3413 3450 sbi->s_mb_free_pending = 0; ··· 
3477 3516 free_percpu(sbi->s_locality_groups); 3478 3517 sbi->s_locality_groups = NULL; 3479 3518 out: 3519 + kfree(sbi->s_mb_avg_fragment_size); 3520 + kfree(sbi->s_mb_avg_fragment_size_locks); 3480 3521 kfree(sbi->s_mb_largest_free_orders); 3481 3522 kfree(sbi->s_mb_largest_free_orders_locks); 3482 3523 kfree(sbi->s_mb_offsets); ··· 3545 3582 kvfree(group_info); 3546 3583 rcu_read_unlock(); 3547 3584 } 3585 + kfree(sbi->s_mb_avg_fragment_size); 3586 + kfree(sbi->s_mb_avg_fragment_size_locks); 3548 3587 kfree(sbi->s_mb_largest_free_orders); 3549 3588 kfree(sbi->s_mb_largest_free_orders_locks); 3550 3589 kfree(sbi->s_mb_offsets); ··· 5158 5193 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 5159 5194 int bsbits = ac->ac_sb->s_blocksize_bits; 5160 5195 loff_t size, isize; 5196 + bool inode_pa_eligible, group_pa_eligible; 5161 5197 5162 5198 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 5163 5199 return; ··· 5166 5200 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 5167 5201 return; 5168 5202 5203 + group_pa_eligible = sbi->s_mb_group_prealloc > 0; 5204 + inode_pa_eligible = true; 5169 5205 size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); 5170 5206 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 5171 5207 >> bsbits; 5172 5208 5209 + /* No point in using inode preallocation for closed files */ 5173 5210 if ((size == isize) && !ext4_fs_is_busy(sbi) && 5174 - !inode_is_open_for_write(ac->ac_inode)) { 5175 - ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; 5176 - return; 5177 - } 5211 + !inode_is_open_for_write(ac->ac_inode)) 5212 + inode_pa_eligible = false; 5178 5213 5179 - if (sbi->s_mb_group_prealloc <= 0) { 5180 - ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 5181 - return; 5182 - } 5183 - 5184 - /* don't use group allocation for large files */ 5185 5214 size = max(size, isize); 5186 - if (size > sbi->s_mb_stream_request) { 5187 - ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 5215 + /* Don't use group allocation for large files */ 5216 + if (size > 
sbi->s_mb_stream_request) 5217 + group_pa_eligible = false; 5218 + 5219 + if (!group_pa_eligible) { 5220 + if (inode_pa_eligible) 5221 + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 5222 + else 5223 + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; 5188 5224 return; 5189 5225 } 5190 5226 ··· 5533 5565 ext4_fsblk_t block = 0; 5534 5566 unsigned int inquota = 0; 5535 5567 unsigned int reserv_clstrs = 0; 5568 + int retries = 0; 5536 5569 u64 seq; 5537 5570 5538 5571 might_sleep(); ··· 5636 5667 ar->len = ac->ac_b_ex.fe_len; 5637 5668 } 5638 5669 } else { 5639 - if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) 5670 + if (++retries < 3 && 5671 + ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) 5640 5672 goto repeat; 5641 5673 /* 5642 5674 * If block allocation fails then the pa allocated above
-1
fs/ext4/mballoc.h
··· 178 178 /* copy of the best found extent taken before preallocation efforts */ 179 179 struct ext4_free_extent ac_f_ex; 180 180 181 - ext4_group_t ac_last_optimal_group; 182 181 __u32 ac_groups_considered; 183 182 __u32 ac_flags; /* allocation hints */ 184 183 __u16 ac_groups_scanned;