Merge tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

+3 -36

fs/btrfs/acl.c

··· 55 55 return acl; 56 56 } 57 57 58 - static int __btrfs_set_acl(struct btrfs_trans_handle *trans, 59 - struct user_namespace *mnt_userns, 60 - struct inode *inode, struct posix_acl *acl, int type) 58 + int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, 59 + struct posix_acl *acl, int type) 61 60 { 62 61 int ret, size = 0; 63 62 const char *name; ··· 122 123 if (ret) 123 124 return ret; 124 125 } 125 - ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type); 126 + ret = __btrfs_set_acl(NULL, inode, acl, type); 126 127 if (ret) 127 128 inode->i_mode = old_mode; 128 - return ret; 129 - } 130 - 131 - int btrfs_init_acl(struct btrfs_trans_handle *trans, 132 - struct inode *inode, struct inode *dir) 133 - { 134 - struct posix_acl *default_acl, *acl; 135 - int ret = 0; 136 - 137 - /* this happens with subvols */ 138 - if (!dir) 139 - return 0; 140 - 141 - ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); 142 - if (ret) 143 - return ret; 144 - 145 - if (default_acl) { 146 - ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl, 147 - ACL_TYPE_DEFAULT); 148 - posix_acl_release(default_acl); 149 - } 150 - 151 - if (acl) { 152 - if (!ret) 153 - ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl, 154 - ACL_TYPE_ACCESS); 155 - posix_acl_release(acl); 156 - } 157 - 158 - if (!default_acl && !acl) 159 - cache_no_acl(inode); 160 129 return ret; 161 130 }

+23 -101

fs/btrfs/async-thread.c

··· 15 15 enum { 16 16 WORK_DONE_BIT, 17 17 WORK_ORDER_DONE_BIT, 18 - WORK_HIGH_PRIO_BIT, 19 18 }; 20 19 21 20 #define NO_THRESHOLD (-1) 22 21 #define DFT_THRESHOLD (32) 23 22 24 - struct __btrfs_workqueue { 23 + struct btrfs_workqueue { 25 24 struct workqueue_struct *normal_wq; 26 25 27 26 /* File system this workqueue services */ ··· 47 48 spinlock_t thres_lock; 48 49 }; 49 50 50 - struct btrfs_workqueue { 51 - struct __btrfs_workqueue *normal; 52 - struct __btrfs_workqueue *high; 53 - }; 54 - 55 - struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq) 51 + struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq) 56 52 { 57 53 return wq->fs_info; 58 54 } ··· 60 66 bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) 61 67 { 62 68 /* 63 - * We could compare wq->normal->pending with num_online_cpus() 69 + * We could compare wq->pending with num_online_cpus() 64 70 * to support "thresh == NO_THRESHOLD" case, but it requires 65 71 * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's 66 72 * postpone it until someone needs the support of that case. 
67 73 */ 68 - if (wq->normal->thresh == NO_THRESHOLD) 74 + if (wq->thresh == NO_THRESHOLD) 69 75 return false; 70 76 71 - return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2; 77 + return atomic_read(&wq->pending) > wq->thresh * 2; 72 78 } 73 79 74 - static struct __btrfs_workqueue * 75 - __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, 76 - unsigned int flags, int limit_active, int thresh) 80 + struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, 81 + const char *name, unsigned int flags, 82 + int limit_active, int thresh) 77 83 { 78 - struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); 84 + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); 79 85 80 86 if (!ret) 81 87 return NULL; ··· 99 105 ret->thresh = thresh; 100 106 } 101 107 102 - if (flags & WQ_HIGHPRI) 103 - ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags, 104 - ret->current_active, name); 105 - else 106 - ret->normal_wq = alloc_workqueue("btrfs-%s", flags, 107 - ret->current_active, name); 108 + ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active, 109 + name); 108 110 if (!ret->normal_wq) { 109 111 kfree(ret); 110 112 return NULL; ··· 109 119 INIT_LIST_HEAD(&ret->ordered_list); 110 120 spin_lock_init(&ret->list_lock); 111 121 spin_lock_init(&ret->thres_lock); 112 - trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI); 113 - return ret; 114 - } 115 - 116 - static inline void 117 - __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); 118 - 119 - struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, 120 - const char *name, 121 - unsigned int flags, 122 - int limit_active, 123 - int thresh) 124 - { 125 - struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); 126 - 127 - if (!ret) 128 - return NULL; 129 - 130 - ret->normal = __btrfs_alloc_workqueue(fs_info, name, 131 - flags & ~WQ_HIGHPRI, 132 - limit_active, thresh); 133 - if (!ret->normal) { 134 - 
kfree(ret); 135 - return NULL; 136 - } 137 - 138 - if (flags & WQ_HIGHPRI) { 139 - ret->high = __btrfs_alloc_workqueue(fs_info, name, flags, 140 - limit_active, thresh); 141 - if (!ret->high) { 142 - __btrfs_destroy_workqueue(ret->normal); 143 - kfree(ret); 144 - return NULL; 145 - } 146 - } 122 + trace_btrfs_workqueue_alloc(ret, name); 147 123 return ret; 148 124 } 149 125 ··· 118 162 * This hook WILL be called in IRQ handler context, 119 163 * so workqueue_set_max_active MUST NOT be called in this hook 120 164 */ 121 - static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) 165 + static inline void thresh_queue_hook(struct btrfs_workqueue *wq) 122 166 { 123 167 if (wq->thresh == NO_THRESHOLD) 124 168 return; ··· 130 174 * This hook is called in kthread context. 131 175 * So workqueue_set_max_active is called here. 132 176 */ 133 - static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) 177 + static inline void thresh_exec_hook(struct btrfs_workqueue *wq) 134 178 { 135 179 int new_current_active; 136 180 long pending; ··· 173 217 } 174 218 } 175 219 176 - static void run_ordered_work(struct __btrfs_workqueue *wq, 220 + static void run_ordered_work(struct btrfs_workqueue *wq, 177 221 struct btrfs_work *self) 178 222 { 179 223 struct list_head *list = &wq->ordered_list; ··· 261 305 { 262 306 struct btrfs_work *work = container_of(normal_work, struct btrfs_work, 263 307 normal_work); 264 - struct __btrfs_workqueue *wq; 308 + struct btrfs_workqueue *wq = work->wq; 265 309 int need_order = 0; 266 310 267 311 /* ··· 274 318 */ 275 319 if (work->ordered_func) 276 320 need_order = 1; 277 - wq = work->wq; 278 321 279 322 trace_btrfs_work_sched(work); 280 323 thresh_exec_hook(wq); ··· 305 350 work->flags = 0; 306 351 } 307 352 308 - static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, 309 - struct btrfs_work *work) 353 + void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work) 310 354 { 311 355 unsigned long flags; 312 
356 ··· 320 366 queue_work(wq->normal_wq, &work->normal_work); 321 367 } 322 368 323 - void btrfs_queue_work(struct btrfs_workqueue *wq, 324 - struct btrfs_work *work) 369 + void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) 325 370 { 326 - struct __btrfs_workqueue *dest_wq; 327 - 328 - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) 329 - dest_wq = wq->high; 330 - else 331 - dest_wq = wq->normal; 332 - __btrfs_queue_work(dest_wq, work); 333 - } 334 - 335 - static inline void 336 - __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) 337 - { 371 + if (!wq) 372 + return; 338 373 destroy_workqueue(wq->normal_wq); 339 374 trace_btrfs_workqueue_destroy(wq); 340 375 kfree(wq); 341 376 } 342 377 343 - void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) 344 - { 345 - if (!wq) 346 - return; 347 - if (wq->high) 348 - __btrfs_destroy_workqueue(wq->high); 349 - __btrfs_destroy_workqueue(wq->normal); 350 - kfree(wq); 351 - } 352 - 353 378 void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active) 354 379 { 355 - if (!wq) 356 - return; 357 - wq->normal->limit_active = limit_active; 358 - if (wq->high) 359 - wq->high->limit_active = limit_active; 360 - } 361 - 362 - void btrfs_set_work_high_priority(struct btrfs_work *work) 363 - { 364 - set_bit(WORK_HIGH_PRIO_BIT, &work->flags); 380 + if (wq) 381 + wq->limit_active = limit_active; 365 382 } 366 383 367 384 void btrfs_flush_workqueue(struct btrfs_workqueue *wq) 368 385 { 369 - if (wq->high) 370 - flush_workqueue(wq->high->normal_wq); 371 - 372 - flush_workqueue(wq->normal->normal_wq); 386 + flush_workqueue(wq->normal_wq); 373 387 }

+2 -5

fs/btrfs/async-thread.h

··· 11 11 12 12 struct btrfs_fs_info; 13 13 struct btrfs_workqueue; 14 - /* Internal use only */ 15 - struct __btrfs_workqueue; 16 14 struct btrfs_work; 17 15 typedef void (*btrfs_func_t)(struct btrfs_work *arg); 18 16 typedef void (*btrfs_work_func_t)(struct work_struct *arg); ··· 23 25 /* Don't touch things below */ 24 26 struct work_struct normal_work; 25 27 struct list_head ordered_list; 26 - struct __btrfs_workqueue *wq; 28 + struct btrfs_workqueue *wq; 27 29 unsigned long flags; 28 30 }; 29 31 ··· 38 40 struct btrfs_work *work); 39 41 void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); 40 42 void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); 41 - void btrfs_set_work_high_priority(struct btrfs_work *work); 42 43 struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work); 43 - struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); 44 + struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq); 44 45 bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); 45 46 void btrfs_flush_workqueue(struct btrfs_workqueue *wq); 46 47

+125 -80

fs/btrfs/block-group.c

··· 168 168 struct rb_node **p; 169 169 struct rb_node *parent = NULL; 170 170 struct btrfs_block_group *cache; 171 + bool leftmost = true; 171 172 172 173 ASSERT(block_group->length != 0); 173 174 174 - spin_lock(&info->block_group_cache_lock); 175 - p = &info->block_group_cache_tree.rb_node; 175 + write_lock(&info->block_group_cache_lock); 176 + p = &info->block_group_cache_tree.rb_root.rb_node; 176 177 177 178 while (*p) { 178 179 parent = *p; ··· 182 181 p = &(*p)->rb_left; 183 182 } else if (block_group->start > cache->start) { 184 183 p = &(*p)->rb_right; 184 + leftmost = false; 185 185 } else { 186 - spin_unlock(&info->block_group_cache_lock); 186 + write_unlock(&info->block_group_cache_lock); 187 187 return -EEXIST; 188 188 } 189 189 } 190 190 191 191 rb_link_node(&block_group->cache_node, parent, p); 192 - rb_insert_color(&block_group->cache_node, 193 - &info->block_group_cache_tree); 192 + rb_insert_color_cached(&block_group->cache_node, 193 + &info->block_group_cache_tree, leftmost); 194 194 195 - if (info->first_logical_byte > block_group->start) 196 - info->first_logical_byte = block_group->start; 197 - 198 - spin_unlock(&info->block_group_cache_lock); 195 + write_unlock(&info->block_group_cache_lock); 199 196 200 197 return 0; 201 198 } ··· 209 210 struct rb_node *n; 210 211 u64 end, start; 211 212 212 - spin_lock(&info->block_group_cache_lock); 213 - n = info->block_group_cache_tree.rb_node; 213 + read_lock(&info->block_group_cache_lock); 214 + n = info->block_group_cache_tree.rb_root.rb_node; 214 215 215 216 while (n) { 216 217 cache = rb_entry(n, struct btrfs_block_group, cache_node); ··· 232 233 break; 233 234 } 234 235 } 235 - if (ret) { 236 + if (ret) 236 237 btrfs_get_block_group(ret); 237 - if (bytenr == 0 && info->first_logical_byte > ret->start) 238 - info->first_logical_byte = ret->start; 239 - } 240 - spin_unlock(&info->block_group_cache_lock); 238 + read_unlock(&info->block_group_cache_lock); 241 239 242 240 return ret; 243 241 } ··· 263 
267 struct btrfs_fs_info *fs_info = cache->fs_info; 264 268 struct rb_node *node; 265 269 266 - spin_lock(&fs_info->block_group_cache_lock); 270 + read_lock(&fs_info->block_group_cache_lock); 267 271 268 272 /* If our block group was removed, we need a full search. */ 269 273 if (RB_EMPTY_NODE(&cache->cache_node)) { 270 274 const u64 next_bytenr = cache->start + cache->length; 271 275 272 - spin_unlock(&fs_info->block_group_cache_lock); 276 + read_unlock(&fs_info->block_group_cache_lock); 273 277 btrfs_put_block_group(cache); 274 - cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; 278 + return btrfs_lookup_first_block_group(fs_info, next_bytenr); 275 279 } 276 280 node = rb_next(&cache->cache_node); 277 281 btrfs_put_block_group(cache); ··· 280 284 btrfs_get_block_group(cache); 281 285 } else 282 286 cache = NULL; 283 - spin_unlock(&fs_info->block_group_cache_lock); 287 + read_unlock(&fs_info->block_group_cache_lock); 284 288 return cache; 285 289 } 286 290 287 - bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 291 + /** 292 + * Check if we can do a NOCOW write for a given extent. 293 + * 294 + * @fs_info: The filesystem information object. 295 + * @bytenr: Logical start address of the extent. 296 + * 297 + * Check if we can do a NOCOW write for the given extent, and increments the 298 + * number of NOCOW writers in the block group that contains the extent, as long 299 + * as the block group exists and it's currently not in read-only mode. 300 + * 301 + * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller 302 + * is responsible for calling btrfs_dec_nocow_writers() later. 
303 + * 304 + * Or NULL if we can not do a NOCOW write 305 + */ 306 + struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, 307 + u64 bytenr) 288 308 { 289 309 struct btrfs_block_group *bg; 290 - bool ret = true; 310 + bool can_nocow = true; 291 311 292 312 bg = btrfs_lookup_block_group(fs_info, bytenr); 293 313 if (!bg) 294 - return false; 314 + return NULL; 295 315 296 316 spin_lock(&bg->lock); 297 317 if (bg->ro) 298 - ret = false; 318 + can_nocow = false; 299 319 else 300 320 atomic_inc(&bg->nocow_writers); 301 321 spin_unlock(&bg->lock); 302 322 303 - /* No put on block group, done by btrfs_dec_nocow_writers */ 304 - if (!ret) 323 + if (!can_nocow) { 305 324 btrfs_put_block_group(bg); 325 + return NULL; 326 + } 306 327 307 - return ret; 328 + /* No put on block group, done by btrfs_dec_nocow_writers(). */ 329 + return bg; 308 330 } 309 331 310 - void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 332 + /** 333 + * Decrement the number of NOCOW writers in a block group. 334 + * 335 + * @bg: The block group. 336 + * 337 + * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), 338 + * and on the block group returned by that call. Typically this is called after 339 + * creating an ordered extent for a NOCOW write, to prevent races with scrub and 340 + * relocation. 341 + * 342 + * After this call, the caller should not use the block group anymore. If it wants 343 + * to use it, then it should get a reference on it before calling this function. 
344 + */ 345 + void btrfs_dec_nocow_writers(struct btrfs_block_group *bg) 311 346 { 312 - struct btrfs_block_group *bg; 313 - 314 - bg = btrfs_lookup_block_group(fs_info, bytenr); 315 - ASSERT(bg); 316 347 if (atomic_dec_and_test(&bg->nocow_writers)) 317 348 wake_up_var(&bg->nocow_writers); 318 - /* 319 - * Once for our lookup and once for the lookup done by a previous call 320 - * to btrfs_inc_nocow_writers() 321 - */ 322 - btrfs_put_block_group(bg); 349 + 350 + /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */ 323 351 btrfs_put_block_group(bg); 324 352 } 325 353 ··· 792 772 cache->has_caching_ctl = 1; 793 773 spin_unlock(&cache->lock); 794 774 795 - spin_lock(&fs_info->block_group_cache_lock); 775 + write_lock(&fs_info->block_group_cache_lock); 796 776 refcount_inc(&caching_ctl->count); 797 777 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 798 - spin_unlock(&fs_info->block_group_cache_lock); 778 + write_unlock(&fs_info->block_group_cache_lock); 799 779 800 780 btrfs_get_block_group(cache); 801 781 ··· 977 957 if (ret) 978 958 goto out; 979 959 980 - spin_lock(&fs_info->block_group_cache_lock); 981 - rb_erase(&block_group->cache_node, 982 - &fs_info->block_group_cache_tree); 960 + write_lock(&fs_info->block_group_cache_lock); 961 + rb_erase_cached(&block_group->cache_node, 962 + &fs_info->block_group_cache_tree); 983 963 RB_CLEAR_NODE(&block_group->cache_node); 984 964 985 965 /* Once for the block groups rbtree */ 986 966 btrfs_put_block_group(block_group); 987 967 988 - if (fs_info->first_logical_byte == block_group->start) 989 - fs_info->first_logical_byte = (u64)-1; 990 - spin_unlock(&fs_info->block_group_cache_lock); 968 + write_unlock(&fs_info->block_group_cache_lock); 991 969 992 970 down_write(&block_group->space_info->groups_sem); 993 971 /* ··· 1010 992 if (block_group->cached == BTRFS_CACHE_STARTED) 1011 993 btrfs_wait_block_group_cache_done(block_group); 1012 994 if (block_group->has_caching_ctl) { 1013 - 
spin_lock(&fs_info->block_group_cache_lock); 995 + write_lock(&fs_info->block_group_cache_lock); 1014 996 if (!caching_ctl) { 1015 997 struct btrfs_caching_control *ctl; 1016 998 ··· 1024 1006 } 1025 1007 if (caching_ctl) 1026 1008 list_del_init(&caching_ctl->list); 1027 - spin_unlock(&fs_info->block_group_cache_lock); 1009 + write_unlock(&fs_info->block_group_cache_lock); 1028 1010 if (caching_ctl) { 1029 1011 /* Once for the caching bgs list and once for us. */ 1030 1012 btrfs_put_caching_control(caching_ctl); ··· 1385 1367 goto next; 1386 1368 } 1387 1369 1370 + ret = btrfs_zone_finish(block_group); 1371 + if (ret < 0) { 1372 + btrfs_dec_block_group_ro(block_group); 1373 + if (ret == -EAGAIN) 1374 + ret = 0; 1375 + goto next; 1376 + } 1377 + 1388 1378 /* 1389 1379 * Want to do this before we do anything else so we can recover 1390 1380 * properly if we fail to join the transaction. ··· 1538 1512 return bg1->used > bg2->used; 1539 1513 } 1540 1514 1515 + static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) 1516 + { 1517 + if (btrfs_is_zoned(fs_info)) 1518 + return btrfs_zoned_should_reclaim(fs_info); 1519 + return true; 1520 + } 1521 + 1541 1522 void btrfs_reclaim_bgs_work(struct work_struct *work) 1542 1523 { 1543 1524 struct btrfs_fs_info *fs_info = ··· 1553 1520 struct btrfs_space_info *space_info; 1554 1521 1555 1522 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 1523 + return; 1524 + 1525 + if (!btrfs_should_reclaim(fs_info)) 1556 1526 return; 1557 1527 1558 1528 sb_start_write(fs_info->sb); ··· 1728 1692 struct btrfs_root *root = btrfs_block_group_root(fs_info); 1729 1693 int ret; 1730 1694 struct btrfs_key found_key; 1731 - struct extent_buffer *leaf; 1732 - int slot; 1733 1695 1734 - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 1735 - if (ret < 0) 1736 - return ret; 1737 - 1738 - while (1) { 1739 - slot = path->slots[0]; 1740 - leaf = path->nodes[0]; 1741 - if (slot >= btrfs_header_nritems(leaf)) { 1742 - ret = 
btrfs_next_leaf(root, path); 1743 - if (ret == 0) 1744 - continue; 1745 - if (ret < 0) 1746 - goto out; 1747 - break; 1748 - } 1749 - btrfs_item_key_to_cpu(leaf, &found_key, slot); 1750 - 1696 + btrfs_for_each_slot(root, key, &found_key, path, ret) { 1751 1697 if (found_key.objectid >= key->objectid && 1752 1698 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 1753 - ret = read_bg_from_eb(fs_info, &found_key, path); 1754 - break; 1699 + return read_bg_from_eb(fs_info, &found_key, path); 1755 1700 } 1756 - 1757 - path->slots[0]++; 1758 1701 } 1759 - out: 1760 1702 return ret; 1761 1703 } 1762 1704 ··· 3234 3220 return ret; 3235 3221 } 3236 3222 3223 + static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, 3224 + u64 bytes_freed) 3225 + { 3226 + const struct btrfs_space_info *space_info = bg->space_info; 3227 + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); 3228 + const u64 new_val = bg->used; 3229 + const u64 old_val = new_val + bytes_freed; 3230 + u64 thresh; 3231 + 3232 + if (reclaim_thresh == 0) 3233 + return false; 3234 + 3235 + thresh = div_factor_fine(bg->length, reclaim_thresh); 3236 + 3237 + /* 3238 + * If we were below the threshold before don't reclaim, we are likely a 3239 + * brand new block group and we don't want to relocate new block groups. 
3240 + */ 3241 + if (old_val < thresh) 3242 + return false; 3243 + if (new_val >= thresh) 3244 + return false; 3245 + return true; 3246 + } 3247 + 3237 3248 int btrfs_update_block_group(struct btrfs_trans_handle *trans, 3238 3249 u64 bytenr, u64 num_bytes, bool alloc) 3239 3250 { ··· 3281 3242 spin_unlock(&info->delalloc_root_lock); 3282 3243 3283 3244 while (total) { 3245 + bool reclaim; 3246 + 3284 3247 cache = btrfs_lookup_block_group(info, bytenr); 3285 3248 if (!cache) { 3286 3249 ret = -ENOENT; ··· 3328 3287 cache->space_info, num_bytes); 3329 3288 cache->space_info->bytes_used -= num_bytes; 3330 3289 cache->space_info->disk_used -= num_bytes * factor; 3290 + 3291 + reclaim = should_reclaim_block_group(cache, num_bytes); 3331 3292 spin_unlock(&cache->lock); 3332 3293 spin_unlock(&cache->space_info->lock); 3333 3294 ··· 3356 3313 if (!alloc && old_val == 0) { 3357 3314 if (!btrfs_test_opt(info, DISCARD_ASYNC)) 3358 3315 btrfs_mark_bg_unused(cache); 3316 + } else if (!alloc && reclaim) { 3317 + btrfs_mark_bg_to_reclaim(cache); 3359 3318 } 3360 3319 3361 3320 btrfs_put_block_group(cache); ··· 4002 3957 struct btrfs_caching_control *caching_ctl; 4003 3958 struct rb_node *n; 4004 3959 4005 - spin_lock(&info->block_group_cache_lock); 3960 + write_lock(&info->block_group_cache_lock); 4006 3961 while (!list_empty(&info->caching_block_groups)) { 4007 3962 caching_ctl = list_entry(info->caching_block_groups.next, 4008 3963 struct btrfs_caching_control, list); 4009 3964 list_del(&caching_ctl->list); 4010 3965 btrfs_put_caching_control(caching_ctl); 4011 3966 } 4012 - spin_unlock(&info->block_group_cache_lock); 3967 + write_unlock(&info->block_group_cache_lock); 4013 3968 4014 3969 spin_lock(&info->unused_bgs_lock); 4015 3970 while (!list_empty(&info->unused_bgs)) { ··· 4039 3994 } 4040 3995 spin_unlock(&info->zone_active_bgs_lock); 4041 3996 4042 - spin_lock(&info->block_group_cache_lock); 4043 - while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 3997 + 
write_lock(&info->block_group_cache_lock); 3998 + while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) { 4044 3999 block_group = rb_entry(n, struct btrfs_block_group, 4045 4000 cache_node); 4046 - rb_erase(&block_group->cache_node, 4047 - &info->block_group_cache_tree); 4001 + rb_erase_cached(&block_group->cache_node, 4002 + &info->block_group_cache_tree); 4048 4003 RB_CLEAR_NODE(&block_group->cache_node); 4049 - spin_unlock(&info->block_group_cache_lock); 4004 + write_unlock(&info->block_group_cache_lock); 4050 4005 4051 4006 down_write(&block_group->space_info->groups_sem); 4052 4007 list_del(&block_group->list); ··· 4069 4024 ASSERT(block_group->swap_extents == 0); 4070 4025 btrfs_put_block_group(block_group); 4071 4026 4072 - spin_lock(&info->block_group_cache_lock); 4027 + write_lock(&info->block_group_cache_lock); 4073 4028 } 4074 - spin_unlock(&info->block_group_cache_lock); 4029 + write_unlock(&info->block_group_cache_lock); 4075 4030 4076 4031 btrfs_release_global_block_rsv(info); 4077 4032

+5 -2

fs/btrfs/block-group.h

··· 212 212 u64 meta_write_pointer; 213 213 struct map_lookup *physical_map; 214 214 struct list_head active_bg_list; 215 + struct work_struct zone_finish_work; 216 + struct extent_buffer *last_eb; 215 217 }; 216 218 217 219 static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) ··· 256 254 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 257 255 const u64 start); 258 256 void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg); 259 - bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); 260 - void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); 257 + struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, 258 + u64 bytenr); 259 + void btrfs_dec_nocow_writers(struct btrfs_block_group *bg); 261 260 void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); 262 261 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, 263 262 u64 num_bytes);

-25

fs/btrfs/btrfs_inode.h

··· 395 395 return true; 396 396 } 397 397 398 - struct btrfs_dio_private { 399 - struct inode *inode; 400 - 401 - /* 402 - * Since DIO can use anonymous page, we cannot use page_offset() to 403 - * grab the file offset, thus need a dedicated member for file offset. 404 - */ 405 - u64 file_offset; 406 - u64 disk_bytenr; 407 - /* Used for bio::bi_size */ 408 - u32 bytes; 409 - 410 - /* 411 - * References to this structure. There is one reference per in-flight 412 - * bio plus one while we're still setting up. 413 - */ 414 - refcount_t refs; 415 - 416 - /* dio_bio came from fs/direct-io.c */ 417 - struct bio *dio_bio; 418 - 419 - /* Array of checksums */ 420 - u8 csums[]; 421 - }; 422 - 423 398 /* 424 399 * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two 425 400 * separate u32s. These two functions convert between the two representations.

+84 -94

fs/btrfs/check-integrity.c

··· 1552 1552 return -ENOMEM; 1553 1553 block_ctx->datav = block_ctx->mem_to_free; 1554 1554 block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); 1555 - for (i = 0; i < num_pages; i++) { 1556 - block_ctx->pagev[i] = alloc_page(GFP_NOFS); 1557 - if (!block_ctx->pagev[i]) 1558 - return -1; 1559 - } 1555 + ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev); 1556 + if (ret) 1557 + return ret; 1560 1558 1561 1559 dev_bytenr = block_ctx->dev_bytenr; 1562 1560 for (i = 0; i < num_pages;) { 1563 1561 struct bio *bio; 1564 1562 unsigned int j; 1565 1563 1566 - bio = btrfs_bio_alloc(num_pages - i); 1567 - bio_set_dev(bio, block_ctx->dev->bdev); 1564 + bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, 1565 + REQ_OP_READ, GFP_NOFS); 1568 1566 bio->bi_iter.bi_sector = dev_bytenr >> 9; 1569 - bio->bi_opf = REQ_OP_READ; 1570 1567 1571 1568 for (j = i; j < num_pages; j++) { 1572 1569 ret = bio_add_page(bio, block_ctx->pagev[j], ··· 2030 2033 2031 2034 static void btrfsic_bio_end_io(struct bio *bp) 2032 2035 { 2033 - struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; 2036 + struct btrfsic_block *block = bp->bi_private; 2034 2037 int iodone_w_error; 2035 2038 2036 2039 /* mutex is not held! 
This is not safe if IO is not yet completed ··· 2632 2635 &btrfsic_dev_state_hashtable); 2633 2636 } 2634 2637 2635 - static void __btrfsic_submit_bio(struct bio *bio) 2638 + static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) 2639 + { 2640 + unsigned int segs = bio_segments(bio); 2641 + u64 dev_bytenr = 512 * bio->bi_iter.bi_sector; 2642 + u64 cur_bytenr = dev_bytenr; 2643 + struct bvec_iter iter; 2644 + struct bio_vec bvec; 2645 + char **mapped_datav; 2646 + int bio_is_patched = 0; 2647 + int i = 0; 2648 + 2649 + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 2650 + pr_info( 2651 + "submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", 2652 + bio_op(bio), bio->bi_opf, segs, 2653 + bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); 2654 + 2655 + mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); 2656 + if (!mapped_datav) 2657 + return; 2658 + 2659 + bio_for_each_segment(bvec, bio, iter) { 2660 + BUG_ON(bvec.bv_len != PAGE_SIZE); 2661 + mapped_datav[i] = page_address(bvec.bv_page); 2662 + i++; 2663 + 2664 + if (dev_state->state->print_mask & 2665 + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) 2666 + pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", 2667 + i, cur_bytenr, bvec.bv_len, bvec.bv_offset); 2668 + cur_bytenr += bvec.bv_len; 2669 + } 2670 + 2671 + btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, 2672 + bio, &bio_is_patched, bio->bi_opf); 2673 + kfree(mapped_datav); 2674 + } 2675 + 2676 + static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) 2677 + { 2678 + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 2679 + pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", 2680 + bio_op(bio), bio->bi_opf, bio->bi_bdev); 2681 + 2682 + if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) { 2683 + struct btrfsic_block *const block = 2684 + &dev_state->dummy_block_for_bio_bh_flush; 2685 + 2686 
+ block->is_iodone = 0; 2687 + block->never_written = 0; 2688 + block->iodone_w_error = 0; 2689 + block->flush_gen = dev_state->last_flush_gen + 1; 2690 + block->submit_bio_bh_rw = bio->bi_opf; 2691 + block->orig_bio_private = bio->bi_private; 2692 + block->orig_bio_end_io = bio->bi_end_io; 2693 + block->next_in_same_bio = NULL; 2694 + bio->bi_private = block; 2695 + bio->bi_end_io = btrfsic_bio_end_io; 2696 + } else if ((dev_state->state->print_mask & 2697 + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | 2698 + BTRFSIC_PRINT_MASK_VERBOSE))) { 2699 + pr_info( 2700 + "btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", 2701 + dev_state->bdev); 2702 + } 2703 + } 2704 + 2705 + void btrfsic_check_bio(struct bio *bio) 2636 2706 { 2637 2707 struct btrfsic_dev_state *dev_state; 2638 2708 2639 2709 if (!btrfsic_is_initialized) 2640 2710 return; 2641 2711 2642 - mutex_lock(&btrfsic_mutex); 2643 - /* since btrfsic_submit_bio() is also called before 2644 - * btrfsic_mount(), this might return NULL */ 2712 + /* 2713 + * We can be called before btrfsic_mount, so there might not be a 2714 + * dev_state. 
2715 + */ 2645 2716 dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); 2646 - if (NULL != dev_state && 2647 - (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { 2648 - int i = 0; 2649 - u64 dev_bytenr; 2650 - u64 cur_bytenr; 2651 - struct bio_vec bvec; 2652 - struct bvec_iter iter; 2653 - int bio_is_patched; 2654 - char **mapped_datav; 2655 - unsigned int segs = bio_segments(bio); 2656 - 2657 - dev_bytenr = 512 * bio->bi_iter.bi_sector; 2658 - bio_is_patched = 0; 2659 - if (dev_state->state->print_mask & 2660 - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 2661 - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", 2662 - bio_op(bio), bio->bi_opf, segs, 2663 - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); 2664 - 2665 - mapped_datav = kmalloc_array(segs, 2666 - sizeof(*mapped_datav), GFP_NOFS); 2667 - if (!mapped_datav) 2668 - goto leave; 2669 - cur_bytenr = dev_bytenr; 2670 - 2671 - bio_for_each_segment(bvec, bio, iter) { 2672 - BUG_ON(bvec.bv_len != PAGE_SIZE); 2673 - mapped_datav[i] = page_address(bvec.bv_page); 2674 - i++; 2675 - 2676 - if (dev_state->state->print_mask & 2677 - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) 2678 - pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", 2679 - i, cur_bytenr, bvec.bv_len, bvec.bv_offset); 2680 - cur_bytenr += bvec.bv_len; 2681 - } 2682 - btrfsic_process_written_block(dev_state, dev_bytenr, 2683 - mapped_datav, segs, 2684 - bio, &bio_is_patched, 2685 - bio->bi_opf); 2686 - kfree(mapped_datav); 2687 - } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { 2688 - if (dev_state->state->print_mask & 2689 - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 2690 - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", 2691 - bio_op(bio), bio->bi_opf, bio->bi_bdev); 2692 - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { 2693 - if ((dev_state->state->print_mask & 2694 - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | 2695 - BTRFSIC_PRINT_MASK_VERBOSE))) 2696 - pr_info( 2697 - "btrfsic_submit_bio(%pg) with 
FLUSH but dummy block already in use (ignored)!\n", 2698 - dev_state->bdev); 2699 - } else { 2700 - struct btrfsic_block *const block = 2701 - &dev_state->dummy_block_for_bio_bh_flush; 2702 - 2703 - block->is_iodone = 0; 2704 - block->never_written = 0; 2705 - block->iodone_w_error = 0; 2706 - block->flush_gen = dev_state->last_flush_gen + 1; 2707 - block->submit_bio_bh_rw = bio->bi_opf; 2708 - block->orig_bio_private = bio->bi_private; 2709 - block->orig_bio_end_io = bio->bi_end_io; 2710 - block->next_in_same_bio = NULL; 2711 - bio->bi_private = block; 2712 - bio->bi_end_io = btrfsic_bio_end_io; 2713 - } 2717 + mutex_lock(&btrfsic_mutex); 2718 + if (dev_state) { 2719 + if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio)) 2720 + btrfsic_check_write_bio(bio, dev_state); 2721 + else if (bio->bi_opf & REQ_PREFLUSH) 2722 + btrfsic_check_flush_bio(bio, dev_state); 2714 2723 } 2715 - leave: 2716 2724 mutex_unlock(&btrfsic_mutex); 2717 - } 2718 - 2719 - void btrfsic_submit_bio(struct bio *bio) 2720 - { 2721 - __btrfsic_submit_bio(bio); 2722 - submit_bio(bio); 2723 - } 2724 - 2725 - int btrfsic_submit_bio_wait(struct bio *bio) 2726 - { 2727 - __btrfsic_submit_bio(bio); 2728 - return submit_bio_wait(bio); 2729 2725 } 2730 2726 2731 2727 int btrfsic_mount(struct btrfs_fs_info *fs_info,

+2 -4

fs/btrfs/check-integrity.h

··· 7 7 #define BTRFS_CHECK_INTEGRITY_H 8 8 9 9 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 10 - void btrfsic_submit_bio(struct bio *bio); 11 - int btrfsic_submit_bio_wait(struct bio *bio); 10 + void btrfsic_check_bio(struct bio *bio); 12 11 #else 13 - #define btrfsic_submit_bio submit_bio 14 - #define btrfsic_submit_bio_wait submit_bio_wait 12 + static inline void btrfsic_check_bio(struct bio *bio) { } 15 13 #endif 16 14 17 15 int btrfsic_mount(struct btrfs_fs_info *fs_info,

+27 -35

fs/btrfs/compression.c

··· 425 425 } 426 426 427 427 static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, 428 - struct compressed_bio *cb, 429 428 struct bio *bio, int mirror_num) 430 429 { 431 430 blk_status_t ret; ··· 603 604 goto finish_cb; 604 605 } 605 606 606 - ret = submit_compressed_bio(fs_info, cb, bio, 0); 607 + ret = submit_compressed_bio(fs_info, bio, 0); 607 608 if (ret) 608 609 goto finish_cb; 609 610 bio = NULL; ··· 801 802 * After the compressed pages are read, we copy the bytes into the 802 803 * bio we were passed and then call the bio end_io calls 803 804 */ 804 - blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 805 - int mirror_num, unsigned long bio_flags) 805 + void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 806 + int mirror_num) 806 807 { 807 808 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 808 809 struct extent_map_tree *em_tree; 809 810 struct compressed_bio *cb; 810 811 unsigned int compressed_len; 811 - unsigned int nr_pages; 812 - unsigned int pg_index; 813 812 struct bio *comp_bio = NULL; 814 813 const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; 815 814 u64 cur_disk_byte = disk_bytenr; ··· 817 820 u64 em_start; 818 821 struct extent_map *em; 819 822 blk_status_t ret; 820 - int faili = 0; 823 + int ret2; 824 + int i; 821 825 u8 *sums; 822 826 823 827 em_tree = &BTRFS_I(inode)->extent_tree; ··· 853 855 em_len = em->len; 854 856 em_start = em->start; 855 857 858 + cb->len = bio->bi_iter.bi_size; 859 + cb->compressed_len = compressed_len; 860 + cb->compress_type = em->compress_type; 861 + cb->orig_bio = bio; 862 + 856 863 free_extent_map(em); 857 864 em = NULL; 858 865 859 - cb->len = bio->bi_iter.bi_size; 860 - cb->compressed_len = compressed_len; 861 - cb->compress_type = extent_compress_type(bio_flags); 862 - cb->orig_bio = bio; 863 - 864 - nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); 865 - cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), 
866 - GFP_NOFS); 866 + cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); 867 + cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); 867 868 if (!cb->compressed_pages) { 868 869 ret = BLK_STS_RESOURCE; 869 - goto fail1; 870 + goto fail; 870 871 } 871 872 872 - for (pg_index = 0; pg_index < nr_pages; pg_index++) { 873 - cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS); 874 - if (!cb->compressed_pages[pg_index]) { 875 - faili = pg_index - 1; 876 - ret = BLK_STS_RESOURCE; 877 - goto fail2; 878 - } 873 + ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); 874 + if (ret2) { 875 + ret = BLK_STS_RESOURCE; 876 + goto fail; 879 877 } 880 - faili = nr_pages - 1; 881 - cb->nr_pages = nr_pages; 882 878 883 879 add_ra_bio_pages(inode, em_start + em_len, cb); 884 880 ··· 941 949 fs_info->sectorsize); 942 950 sums += fs_info->csum_size * nr_sectors; 943 951 944 - ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num); 952 + ret = submit_compressed_bio(fs_info, comp_bio, mirror_num); 945 953 if (ret) 946 954 goto finish_cb; 947 955 comp_bio = NULL; 948 956 } 949 957 } 950 - return BLK_STS_OK; 958 + return; 951 959 952 - fail2: 953 - while (faili >= 0) { 954 - __free_page(cb->compressed_pages[faili]); 955 - faili--; 960 + fail: 961 + if (cb->compressed_pages) { 962 + for (i = 0; i < cb->nr_pages; i++) { 963 + if (cb->compressed_pages[i]) 964 + __free_page(cb->compressed_pages[i]); 965 + } 956 966 } 957 967 958 968 kfree(cb->compressed_pages); 959 - fail1: 960 969 kfree(cb); 961 970 out: 962 971 free_extent_map(em); 963 972 bio->bi_status = ret; 964 973 bio_endio(bio); 965 - return ret; 974 + return; 966 975 finish_cb: 967 976 if (comp_bio) { 968 977 comp_bio->bi_status = ret; ··· 971 978 } 972 979 /* All bytes of @cb is submitted, endio will free @cb */ 973 980 if (cur_disk_byte == disk_bytenr + compressed_len) 974 - return ret; 981 + return; 975 982 976 983 wait_var_event(cb, refcount_read(&cb->pending_sectors) == 977 984 
(disk_bytenr + compressed_len - cur_disk_byte) >> ··· 983 990 ASSERT(refcount_read(&cb->pending_sectors)); 984 991 /* Now we are the only one referring @cb, can finish it safely. */ 985 992 finish_compressed_bio_read(cb); 986 - return ret; 987 993 } 988 994 989 995 /*

+2 -2

fs/btrfs/compression.h

··· 102 102 unsigned int write_flags, 103 103 struct cgroup_subsys_state *blkcg_css, 104 104 bool writeback); 105 - blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 106 - int mirror_num, unsigned long bio_flags); 105 + void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 106 + int mirror_num); 107 107 108 108 unsigned int btrfs_compress_str2level(unsigned int type, const char *str); 109 109

+82 -20

fs/btrfs/ctree.c

··· 16 16 #include "volumes.h" 17 17 #include "qgroup.h" 18 18 #include "tree-mod-log.h" 19 + #include "tree-checker.h" 19 20 20 21 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root 21 22 *root, struct btrfs_path *path, int level); ··· 343 342 int level = btrfs_header_level(buf); 344 343 345 344 ret = btrfs_set_disk_extent_flags(trans, buf, 346 - new_flags, level, 0); 345 + new_flags, level); 347 346 if (ret) 348 347 return ret; 349 348 } ··· 1391 1390 } 1392 1391 1393 1392 /* 1394 - * helper function for btrfs_search_slot. The goal is to find a block 1395 - * in cache without setting the path to blocking. If we find the block 1396 - * we return zero and the path is unchanged. 1393 + * Helper function for btrfs_search_slot() and other functions that do a search 1394 + * on a btree. The goal is to find a tree block in the cache (the radix tree at 1395 + * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read 1396 + * its pages from disk. 1397 1397 * 1398 - * If we can't find the block, we set the path blocking and do some 1399 - * reada. -EAGAIN is returned and the search must be repeated. 1398 + * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the 1399 + * whole btree search, starting again from the current root node. 
1400 1400 */ 1401 1401 static int 1402 1402 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, ··· 1411 1409 struct btrfs_key first_key; 1412 1410 int ret; 1413 1411 int parent_level; 1412 + bool unlock_up; 1414 1413 1414 + unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); 1415 1415 blocknr = btrfs_node_blockptr(*eb_ret, slot); 1416 1416 gen = btrfs_node_ptr_generation(*eb_ret, slot); 1417 1417 parent_level = btrfs_header_level(*eb_ret); 1418 1418 btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); 1419 1419 1420 + /* 1421 + * If we need to read an extent buffer from disk and we are holding locks 1422 + * on upper level nodes, we unlock all the upper nodes before reading the 1423 + * extent buffer, and then return -EAGAIN to the caller as it needs to 1424 + * restart the search. We don't release the lock on the current level 1425 + * because we need to walk this node to figure out which blocks to read. 1426 + */ 1420 1427 tmp = find_extent_buffer(fs_info, blocknr); 1421 1428 if (tmp) { 1422 1429 if (p->reada == READA_FORWARD_ALWAYS) ··· 1447 1436 return 0; 1448 1437 } 1449 1438 1439 + if (unlock_up) 1440 + btrfs_unlock_up_safe(p, level + 1); 1441 + 1450 1442 /* now we're allowed to do a blocking uptodate check */ 1451 - ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); 1443 + ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); 1452 1444 if (ret) { 1453 1445 free_extent_buffer(tmp); 1454 1446 btrfs_release_path(p); 1455 1447 return -EIO; 1456 1448 } 1457 - *eb_ret = tmp; 1458 - return 0; 1449 + if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { 1450 + free_extent_buffer(tmp); 1451 + btrfs_release_path(p); 1452 + return -EUCLEAN; 1453 + } 1454 + 1455 + if (unlock_up) 1456 + ret = -EAGAIN; 1457 + 1458 + goto out; 1459 1459 } 1460 1460 1461 - /* 1462 - * reduce lock contention at high levels 1463 - * of the btree by dropping locks before 1464 - * we read. 
Don't release the lock on the current 1465 - * level because we need to walk this node to figure 1466 - * out which blocks to read. 1467 - */ 1468 - btrfs_unlock_up_safe(p, level + 1); 1461 + if (unlock_up) { 1462 + btrfs_unlock_up_safe(p, level + 1); 1463 + ret = -EAGAIN; 1464 + } else { 1465 + ret = 0; 1466 + } 1469 1467 1470 1468 if (p->reada != READA_NONE) 1471 1469 reada_for_search(fs_info, p, level, slot, key->objectid); 1472 1470 1473 - ret = -EAGAIN; 1474 1471 tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, 1475 1472 gen, parent_level - 1, &first_key); 1476 1473 if (IS_ERR(tmp)) { ··· 1493 1474 */ 1494 1475 if (!extent_buffer_uptodate(tmp)) 1495 1476 ret = -EIO; 1496 - free_extent_buffer(tmp); 1497 1477 1498 - btrfs_release_path(p); 1478 + out: 1479 + if (ret == 0) { 1480 + *eb_ret = tmp; 1481 + } else { 1482 + free_extent_buffer(tmp); 1483 + btrfs_release_path(p); 1484 + } 1485 + 1499 1486 return ret; 1500 1487 } 1501 1488 ··· 2302 2277 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); 2303 2278 2304 2279 return ret; 2280 + } 2281 + 2282 + /** 2283 + * Search for a valid slot for the given path. 2284 + * 2285 + * @root: The root node of the tree. 2286 + * @key: Will contain a valid item if found. 2287 + * @path: The starting point to validate the slot. 2288 + * 2289 + * Return: 0 if the item is valid 2290 + * 1 if not found 2291 + * <0 if error. 2292 + */ 2293 + int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, 2294 + struct btrfs_path *path) 2295 + { 2296 + while (1) { 2297 + int ret; 2298 + const int slot = path->slots[0]; 2299 + const struct extent_buffer *leaf = path->nodes[0]; 2300 + 2301 + /* This is where we start walking the path. */ 2302 + if (slot >= btrfs_header_nritems(leaf)) { 2303 + /* 2304 + * If we've reached the last slot in this leaf we need 2305 + * to go to the next leaf and reset the path. 
2306 + */ 2307 + ret = btrfs_next_leaf(root, path); 2308 + if (ret) 2309 + return ret; 2310 + continue; 2311 + } 2312 + /* Store the found, valid item in @key. */ 2313 + btrfs_item_key_to_cpu(leaf, key, slot); 2314 + break; 2315 + } 2316 + return 0; 2305 2317 } 2306 2318 2307 2319 /*

+121 -44

fs/btrfs/ctree.h

··· 675 675 rwlock_t global_root_lock; 676 676 struct rb_root global_root_tree; 677 677 678 - spinlock_t fs_roots_radix_lock; 679 - struct radix_tree_root fs_roots_radix; 678 + /* The xarray that holds all the FS roots */ 679 + spinlock_t fs_roots_lock; 680 + struct xarray fs_roots; 680 681 681 682 /* block group cache stuff */ 682 - spinlock_t block_group_cache_lock; 683 - u64 first_logical_byte; 684 - struct rb_root block_group_cache_tree; 683 + rwlock_t block_group_cache_lock; 684 + struct rb_root_cached block_group_cache_tree; 685 685 686 686 /* keep track of unallocated space */ 687 687 atomic64_t free_chunk_space; ··· 848 848 * two 849 849 */ 850 850 struct btrfs_workqueue *workers; 851 + struct btrfs_workqueue *hipri_workers; 851 852 struct btrfs_workqueue *delalloc_workers; 852 853 struct btrfs_workqueue *flush_workers; 853 854 struct btrfs_workqueue *endio_workers; 854 855 struct btrfs_workqueue *endio_meta_workers; 855 856 struct btrfs_workqueue *endio_raid56_workers; 856 - struct btrfs_workqueue *rmw_workers; 857 + struct workqueue_struct *rmw_workers; 857 858 struct btrfs_workqueue *endio_meta_write_workers; 858 859 struct btrfs_workqueue *endio_write_workers; 859 860 struct btrfs_workqueue *endio_freespace_worker; ··· 947 946 * running. 
948 947 */ 949 948 refcount_t scrub_workers_refcnt; 950 - struct btrfs_workqueue *scrub_workers; 951 - struct btrfs_workqueue *scrub_wr_completion_workers; 952 - struct btrfs_workqueue *scrub_parity_workers; 949 + struct workqueue_struct *scrub_workers; 950 + struct workqueue_struct *scrub_wr_completion_workers; 951 + struct workqueue_struct *scrub_parity_workers; 953 952 struct btrfs_subpage_info *subpage_info; 954 953 955 954 struct btrfs_discard_ctl discard_ctl; ··· 995 994 996 995 struct btrfs_delayed_root *delayed_root; 997 996 998 - /* Extent buffer radix tree */ 997 + /* Extent buffer xarray */ 999 998 spinlock_t buffer_lock; 1000 999 /* Entries are eb->start / sectorsize */ 1001 - struct radix_tree_root buffer_radix; 1000 + struct xarray extent_buffers; 1002 1001 1003 1002 /* next backup root to be overwritten */ 1004 1003 int backup_root_index; ··· 1046 1045 * Zone size > 0 when in ZONED mode, otherwise it's used for a check 1047 1046 * if the mode is enabled 1048 1047 */ 1049 - union { 1050 - u64 zone_size; 1051 - u64 zoned; 1052 - }; 1048 + u64 zone_size; 1053 1049 1054 1050 struct mutex zoned_meta_io_lock; 1055 1051 spinlock_t treelog_bg_lock; ··· 1119 1121 */ 1120 1122 BTRFS_ROOT_SHAREABLE, 1121 1123 BTRFS_ROOT_TRACK_DIRTY, 1122 - BTRFS_ROOT_IN_RADIX, 1124 + /* The root is tracked in fs_info::fs_roots */ 1125 + BTRFS_ROOT_REGISTERED, 1123 1126 BTRFS_ROOT_ORPHAN_ITEM_INSERTED, 1124 1127 BTRFS_ROOT_DEFRAG_RUNNING, 1125 1128 BTRFS_ROOT_FORCE_COW, ··· 1224 1225 struct rb_root inode_tree; 1225 1226 1226 1227 /* 1227 - * radix tree that keeps track of delayed nodes of every inode, 1228 - * protected by inode_lock 1228 + * Xarray that keeps track of delayed nodes of every inode, protected 1229 + * by inode_lock 1229 1230 */ 1230 - struct radix_tree_root delayed_nodes_tree; 1231 + struct xarray delayed_nodes; 1231 1232 /* 1232 1233 * right now this just gets used so that a root has its own devid 1233 1234 * for stat. 
It may be used for more later ··· 2783 2784 u64 bytenr, u64 num_bytes); 2784 2785 int btrfs_exclude_logged_extents(struct extent_buffer *eb); 2785 2786 int btrfs_cross_ref_exist(struct btrfs_root *root, 2786 - u64 objectid, u64 offset, u64 bytenr, bool strict); 2787 + u64 objectid, u64 offset, u64 bytenr, bool strict, 2788 + struct btrfs_path *path); 2787 2789 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 2788 2790 struct btrfs_root *root, 2789 2791 u64 parent, u64 root_objectid, ··· 2811 2811 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2812 2812 struct extent_buffer *buf, int full_backref); 2813 2813 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2814 - struct extent_buffer *eb, u64 flags, 2815 - int level, int is_data); 2814 + struct extent_buffer *eb, u64 flags, int level); 2816 2815 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); 2817 2816 2818 2817 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, ··· 2891 2892 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); 2892 2893 2893 2894 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, 2894 - u64 disk_num_bytes); 2895 + u64 disk_num_bytes, bool noflush); 2895 2896 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); 2896 2897 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, 2897 2898 u64 start, u64 end); ··· 3037 3038 3038 3039 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, 3039 3040 struct btrfs_path *path); 3041 + 3042 + int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, 3043 + struct btrfs_path *path); 3044 + 3045 + /* 3046 + * Search in @root for a given @key, and store the slot found in @found_key. 3047 + * 3048 + * @root: The root node of the tree. 3049 + * @key: The key we are looking for. 
3050 + * @found_key: Will hold the found item. 3051 + * @path: Holds the current slot/leaf. 3052 + * @iter_ret: Contains the value returned from btrfs_search_slot or 3053 + * btrfs_get_next_valid_item, whichever was executed last. 3054 + * 3055 + * The @iter_ret is an output variable that will contain the return value of 3056 + * btrfs_search_slot, if it encountered an error, or the value returned from 3057 + * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid 3058 + * slot was found, 1 if there were no more leaves, and <0 if there was an error. 3059 + * 3060 + * It's recommended to use a separate variable for iter_ret and then use it to 3061 + * set the function return value so there's no confusion of the 0/1/errno 3062 + * values stemming from btrfs_search_slot. 3063 + */ 3064 + #define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \ 3065 + for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \ 3066 + (iter_ret) >= 0 && \ 3067 + (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \ 3068 + (path)->slots[0]++ \ 3069 + ) 3040 3070 3041 3071 static inline int btrfs_next_old_item(struct btrfs_root *root, 3042 3072 struct btrfs_path *p, u64 time_seq) ··· 3218 3190 int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); 3219 3191 3220 3192 /* file-item.c */ 3221 - struct btrfs_dio_private; 3222 3193 int btrfs_del_csums(struct btrfs_trans_handle *trans, 3223 3194 struct btrfs_root *root, u64 bytenr, u64 len); 3224 3195 blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); ··· 3251 3224 u64 btrfs_file_extent_end(const struct btrfs_path *path); 3252 3225 3253 3226 /* inode.c */ 3254 - blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, 3255 - int mirror_num, unsigned long bio_flags); 3227 + void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, 3228 + int mirror_num, enum btrfs_compression_type compress_type); 3256 3229 unsigned 
int btrfs_verify_data_csum(struct btrfs_bio *bbio, 3257 3230 u32 bio_offset, struct page *page, 3258 3231 u64 start, u64 end); ··· 3282 3255 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 3283 3256 unsigned int extra_bits, 3284 3257 struct extent_state **cached_state); 3285 - int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3286 - struct btrfs_root *new_root, 3287 - struct btrfs_root *parent_root, 3288 - struct user_namespace *mnt_userns); 3258 + struct btrfs_new_inode_args { 3259 + /* Input */ 3260 + struct inode *dir; 3261 + struct dentry *dentry; 3262 + struct inode *inode; 3263 + bool orphan; 3264 + bool subvol; 3265 + 3266 + /* 3267 + * Output from btrfs_new_inode_prepare(), input to 3268 + * btrfs_create_new_inode(). 3269 + */ 3270 + struct posix_acl *default_acl; 3271 + struct posix_acl *acl; 3272 + }; 3273 + int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, 3274 + unsigned int *trans_num_items); 3275 + int btrfs_create_new_inode(struct btrfs_trans_handle *trans, 3276 + struct btrfs_new_inode_args *args); 3277 + void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); 3278 + struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, 3279 + struct inode *dir); 3289 3280 void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, 3290 3281 unsigned *bits); 3291 3282 void btrfs_clear_delalloc_extent(struct inode *inode, ··· 3314 3269 struct extent_state *orig, u64 split); 3315 3270 void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); 3316 3271 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); 3317 - int btrfs_readpage(struct file *file, struct page *page); 3318 3272 void btrfs_evict_inode(struct inode *inode); 3319 3273 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); 3320 3274 struct inode *btrfs_alloc_inode(struct super_block *sb); ··· 3358 3314 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct 
iov_iter *from, 3359 3315 const struct btrfs_ioctl_encoded_io_args *encoded); 3360 3316 3317 + ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); 3318 + 3361 3319 extern const struct dentry_operations btrfs_dentry_operations; 3362 - extern const struct iomap_ops btrfs_dio_iomap_ops; 3363 - extern const struct iomap_dio_ops btrfs_dio_ops; 3364 3320 3365 3321 /* Inode locking type flags, by default the exclusive lock is taken */ 3366 3322 #define BTRFS_ILOCK_SHARED (1U << 0) ··· 3372 3328 void btrfs_update_inode_bytes(struct btrfs_inode *inode, 3373 3329 const u64 add_bytes, 3374 3330 const u64 del_bytes); 3331 + void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); 3375 3332 3376 3333 /* ioctl.c */ 3377 3334 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); ··· 3448 3403 { 3449 3404 } 3450 3405 3451 - #ifdef CONFIG_PRINTK 3406 + #ifdef CONFIG_PRINTK_INDEX 3407 + 3408 + #define btrfs_printk(fs_info, fmt, args...) \ 3409 + do { \ 3410 + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ 3411 + _btrfs_printk(fs_info, fmt, ##args); \ 3412 + } while (0) 3413 + 3452 3414 __printf(2, 3) 3453 3415 __cold 3454 - void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); 3416 + void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); 3417 + 3418 + #elif defined(CONFIG_PRINTK) 3419 + 3420 + #define btrfs_printk(fs_info, fmt, args...) \ 3421 + _btrfs_printk(fs_info, fmt, ##args) 3422 + 3423 + __printf(2, 3) 3424 + __cold 3425 + void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); 3426 + 3455 3427 #else 3428 + 3456 3429 #define btrfs_printk(fs_info, fmt, args...) \ 3457 3430 btrfs_no_printk(fs_info, fmt, ##args) 3458 3431 #endif ··· 3721 3658 __LINE__, (errno)); \ 3722 3659 } while (0) 3723 3660 3661 + #ifdef CONFIG_PRINTK_INDEX 3662 + 3724 3663 #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) 
\ 3725 - do { \ 3726 - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ 3727 - (errno), fmt, ##args); \ 3664 + do { \ 3665 + printk_index_subsys_emit( \ 3666 + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ 3667 + KERN_CRIT, fmt); \ 3668 + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ 3669 + (errno), fmt, ##args); \ 3728 3670 } while (0) 3671 + 3672 + #else 3673 + 3674 + #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ 3675 + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ 3676 + (errno), fmt, ##args) 3677 + 3678 + #endif 3729 3679 3730 3680 #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ 3731 3681 &(fs_info)->fs_state))) ··· 3892 3816 struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); 3893 3817 int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, 3894 3818 struct posix_acl *acl, int type); 3895 - int btrfs_init_acl(struct btrfs_trans_handle *trans, 3896 - struct inode *inode, struct inode *dir); 3819 + int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, 3820 + struct posix_acl *acl, int type); 3897 3821 #else 3898 3822 #define btrfs_get_acl NULL 3899 3823 #define btrfs_set_acl NULL 3900 - static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, 3901 - struct inode *inode, struct inode *dir) 3824 + static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, 3825 + struct inode *inode, struct posix_acl *acl, 3826 + int type) 3902 3827 { 3903 - return 0; 3828 + return -EOPNOTSUPP; 3904 3829 } 3905 3830 #endif 3906 3831 ··· 4006 3929 4007 3930 static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) 4008 3931 { 4009 - return fs_info->zoned != 0; 3932 + return fs_info->zone_size > 0; 4010 3933 } 4011 3934 4012 3935 static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)

+5 -4

fs/btrfs/delalloc-space.c

··· 289 289 } 290 290 291 291 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, 292 - u64 disk_num_bytes) 292 + u64 disk_num_bytes, bool noflush) 293 293 { 294 294 struct btrfs_root *root = inode->root; 295 295 struct btrfs_fs_info *fs_info = root->fs_info; ··· 308 308 * If we have a transaction open (can happen if we call truncate_block 309 309 * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 310 310 */ 311 - if (btrfs_is_free_space_inode(inode)) { 311 + if (noflush || btrfs_is_free_space_inode(inode)) { 312 312 flush = BTRFS_RESERVE_NO_FLUSH; 313 313 } else { 314 314 if (current->journal_info) ··· 333 333 */ 334 334 calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, 335 335 &meta_reserve, &qgroup_reserve); 336 - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); 336 + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, 337 + noflush); 337 338 if (ret) 338 339 return ret; 339 340 ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); ··· 457 456 ret = btrfs_check_data_free_space(inode, reserved, start, len); 458 457 if (ret < 0) 459 458 return ret; 460 - ret = btrfs_delalloc_reserve_metadata(inode, len, len); 459 + ret = btrfs_delalloc_reserve_metadata(inode, len, len, false); 461 460 if (ret < 0) { 462 461 btrfs_free_reserved_data_space(inode, *reserved, start, len); 463 462 extent_changeset_free(*reserved);

+39 -45

fs/btrfs/delayed-inode.c

··· 78 78 } 79 79 80 80 spin_lock(&root->inode_lock); 81 - node = radix_tree_lookup(&root->delayed_nodes_tree, ino); 81 + node = xa_load(&root->delayed_nodes, ino); 82 82 83 83 if (node) { 84 84 if (btrfs_inode->delayed_node) { ··· 90 90 91 91 /* 92 92 * It's possible that we're racing into the middle of removing 93 - * this node from the radix tree. In this case, the refcount 93 + * this node from the xarray. In this case, the refcount 94 94 * was zero and it should never go back to one. Just return 95 - * NULL like it was never in the radix at all; our release 95 + * NULL like it was never in the xarray at all; our release 96 96 * function is in the process of removing it. 97 97 * 98 98 * Some implementations of refcount_inc refuse to bump the ··· 100 100 * here, refcount_inc() may decide to just WARN_ONCE() instead 101 101 * of actually bumping the refcount. 102 102 * 103 - * If this node is properly in the radix, we want to bump the 103 + * If this node is properly in the xarray, we want to bump the 104 104 * refcount twice, once for the inode and once for this get 105 105 * operation. 
106 106 */ ··· 128 128 u64 ino = btrfs_ino(btrfs_inode); 129 129 int ret; 130 130 131 - again: 132 - node = btrfs_get_delayed_node(btrfs_inode); 133 - if (node) 134 - return node; 131 + do { 132 + node = btrfs_get_delayed_node(btrfs_inode); 133 + if (node) 134 + return node; 135 135 136 - node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); 137 - if (!node) 138 - return ERR_PTR(-ENOMEM); 139 - btrfs_init_delayed_node(node, root, ino); 136 + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); 137 + if (!node) 138 + return ERR_PTR(-ENOMEM); 139 + btrfs_init_delayed_node(node, root, ino); 140 140 141 - /* cached in the btrfs inode and can be accessed */ 142 - refcount_set(&node->refs, 2); 141 + /* Cached in the inode and can be accessed */ 142 + refcount_set(&node->refs, 2); 143 143 144 - ret = radix_tree_preload(GFP_NOFS); 145 - if (ret) { 146 - kmem_cache_free(delayed_node_cache, node); 147 - return ERR_PTR(ret); 148 - } 149 - 150 - spin_lock(&root->inode_lock); 151 - ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); 152 - if (ret == -EEXIST) { 153 - spin_unlock(&root->inode_lock); 154 - kmem_cache_free(delayed_node_cache, node); 155 - radix_tree_preload_end(); 156 - goto again; 157 - } 144 + spin_lock(&root->inode_lock); 145 + ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS); 146 + if (ret) { 147 + spin_unlock(&root->inode_lock); 148 + kmem_cache_free(delayed_node_cache, node); 149 + if (ret != -EBUSY) 150 + return ERR_PTR(ret); 151 + } 152 + } while (ret); 158 153 btrfs_inode->delayed_node = node; 159 154 spin_unlock(&root->inode_lock); 160 - radix_tree_preload_end(); 161 155 162 156 return node; 163 157 } ··· 270 276 * back up. We can delete it now. 
271 277 */ 272 278 ASSERT(refcount_read(&delayed_node->refs) == 0); 273 - radix_tree_delete(&root->delayed_nodes_tree, 274 - delayed_node->inode_id); 279 + xa_erase(&root->delayed_nodes, delayed_node->inode_id); 275 280 spin_unlock(&root->inode_lock); 276 281 kmem_cache_free(delayed_node_cache, delayed_node); 277 282 } ··· 1863 1870 1864 1871 void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) 1865 1872 { 1866 - u64 inode_id = 0; 1873 + unsigned long index = 0; 1874 + struct btrfs_delayed_node *delayed_node; 1867 1875 struct btrfs_delayed_node *delayed_nodes[8]; 1868 - int i, n; 1869 1876 1870 1877 while (1) { 1878 + int n = 0; 1879 + 1871 1880 spin_lock(&root->inode_lock); 1872 - n = radix_tree_gang_lookup(&root->delayed_nodes_tree, 1873 - (void **)delayed_nodes, inode_id, 1874 - ARRAY_SIZE(delayed_nodes)); 1875 - if (!n) { 1881 + if (xa_empty(&root->delayed_nodes)) { 1876 1882 spin_unlock(&root->inode_lock); 1877 - break; 1883 + return; 1878 1884 } 1879 1885 1880 - inode_id = delayed_nodes[n - 1]->inode_id + 1; 1881 - for (i = 0; i < n; i++) { 1886 + xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) { 1882 1887 /* 1883 1888 * Don't increase refs in case the node is dead and 1884 1889 * about to be removed from the tree in the loop below 1885 1890 */ 1886 - if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) 1887 - delayed_nodes[i] = NULL; 1891 + if (refcount_inc_not_zero(&delayed_node->refs)) { 1892 + delayed_nodes[n] = delayed_node; 1893 + n++; 1894 + } 1895 + if (n >= ARRAY_SIZE(delayed_nodes)) 1896 + break; 1888 1897 } 1898 + index++; 1889 1899 spin_unlock(&root->inode_lock); 1890 1900 1891 - for (i = 0; i < n; i++) { 1892 - if (!delayed_nodes[i]) 1893 - continue; 1901 + for (int i = 0; i < n; i++) { 1894 1902 __btrfs_kill_delayed_node(delayed_nodes[i]); 1895 1903 btrfs_release_delayed_node(delayed_nodes[i]); 1896 1904 }

+1 -3

fs/btrfs/delayed-ref.c

··· 930 930 is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); 931 931 932 932 ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); 933 - BUG_ON(extent_op && extent_op->is_data); 934 933 ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); 935 934 if (!ref) 936 935 return -ENOMEM; ··· 1102 1103 return -ENOMEM; 1103 1104 1104 1105 init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, 1105 - BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, 1106 - false); 1106 + BTRFS_UPDATE_DELAYED_HEAD, false, false); 1107 1107 head_ref->extent_op = extent_op; 1108 1108 1109 1109 delayed_refs = &trans->transaction->delayed_refs;

-1

fs/btrfs/delayed-ref.h

··· 58 58 u8 level; 59 59 bool update_key; 60 60 bool update_flags; 61 - bool is_data; 62 61 u64 flags_to_set; 63 62 }; 64 63

+13 -39

fs/btrfs/dev-replace.c

··· 474 474 struct btrfs_dev_extent *dev_extent = NULL; 475 475 struct btrfs_block_group *cache; 476 476 struct btrfs_trans_handle *trans; 477 + int iter_ret = 0; 477 478 int ret = 0; 478 479 u64 chunk_offset; 479 480 ··· 525 524 key.type = BTRFS_DEV_EXTENT_KEY; 526 525 key.offset = 0; 527 526 528 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 529 - if (ret < 0) 530 - goto free_path; 531 - if (ret > 0) { 532 - if (path->slots[0] >= 533 - btrfs_header_nritems(path->nodes[0])) { 534 - ret = btrfs_next_leaf(root, path); 535 - if (ret < 0) 536 - goto free_path; 537 - if (ret > 0) { 538 - ret = 0; 539 - goto free_path; 540 - } 541 - } else { 542 - ret = 0; 543 - } 544 - } 545 - 546 - while (1) { 527 + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 547 528 struct extent_buffer *leaf = path->nodes[0]; 548 - int slot = path->slots[0]; 549 - 550 - btrfs_item_key_to_cpu(leaf, &found_key, slot); 551 529 552 530 if (found_key.objectid != src_dev->devid) 553 531 break; ··· 537 557 if (found_key.offset < key.offset) 538 558 break; 539 559 540 - dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); 560 + dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); 541 561 542 562 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); 543 563 544 564 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 545 565 if (!cache) 546 - goto skip; 566 + continue; 547 567 548 568 spin_lock(&cache->lock); 549 569 cache->to_copy = 1; 550 570 spin_unlock(&cache->lock); 551 571 552 572 btrfs_put_block_group(cache); 553 - 554 - skip: 555 - ret = btrfs_next_item(root, path); 556 - if (ret != 0) { 557 - if (ret > 0) 558 - ret = 0; 559 - break; 560 - } 561 573 } 574 + if (iter_ret < 0) 575 + ret = iter_ret; 562 576 563 - free_path: 564 577 btrfs_free_path(path); 565 578 unlock: 566 579 mutex_unlock(&fs_info->chunk_mutex); ··· 854 881 int scrub_ret) 855 882 { 856 883 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 884 + 
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 857 885 struct btrfs_device *tgt_device; 858 886 struct btrfs_device *src_device; 859 887 struct btrfs_root *root = fs_info->tree_root; ··· 904 930 WARN_ON(ret); 905 931 906 932 /* Prevent write_all_supers() during the finishing procedure */ 907 - mutex_lock(&fs_info->fs_devices->device_list_mutex); 933 + mutex_lock(&fs_devices->device_list_mutex); 908 934 /* Prevent new chunks being allocated on the source device */ 909 935 mutex_lock(&fs_info->chunk_mutex); 910 936 911 937 if (!list_empty(&src_device->post_commit_list)) { 912 - mutex_unlock(&fs_info->fs_devices->device_list_mutex); 938 + mutex_unlock(&fs_devices->device_list_mutex); 913 939 mutex_unlock(&fs_info->chunk_mutex); 914 940 } else { 915 941 break; ··· 946 972 error: 947 973 up_write(&dev_replace->rwsem); 948 974 mutex_unlock(&fs_info->chunk_mutex); 949 - mutex_unlock(&fs_info->fs_devices->device_list_mutex); 975 + mutex_unlock(&fs_devices->device_list_mutex); 950 976 btrfs_rm_dev_replace_blocked(fs_info); 951 977 if (tgt_device) 952 978 btrfs_destroy_dev_replace_tgtdev(tgt_device); ··· 975 1001 976 1002 btrfs_assign_next_active_device(src_device, tgt_device); 977 1003 978 - list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 979 - fs_info->fs_devices->rw_devices++; 1004 + list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); 1005 + fs_devices->rw_devices++; 980 1006 981 1007 up_write(&dev_replace->rwsem); 982 1008 btrfs_rm_dev_replace_blocked(fs_info); ··· 999 1025 * belong to this filesystem. 1000 1026 */ 1001 1027 mutex_unlock(&fs_info->chunk_mutex); 1002 - mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1028 + mutex_unlock(&fs_devices->device_list_mutex); 1003 1029 1004 1030 /* replace the sysfs entry */ 1005 1031 btrfs_sysfs_remove_device(src_device);

+6 -25

fs/btrfs/dir-item.c

··· 325 325 struct btrfs_path *path, u64 dirid, 326 326 const char *name, int name_len) 327 327 { 328 - struct extent_buffer *leaf; 329 328 struct btrfs_dir_item *di; 330 329 struct btrfs_key key; 331 - u32 nritems; 332 330 int ret; 333 331 334 332 key.objectid = dirid; 335 333 key.type = BTRFS_DIR_INDEX_KEY; 336 334 key.offset = 0; 337 335 338 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 339 - if (ret < 0) 340 - return ERR_PTR(ret); 341 - 342 - leaf = path->nodes[0]; 343 - nritems = btrfs_header_nritems(leaf); 344 - 345 - while (1) { 346 - if (path->slots[0] >= nritems) { 347 - ret = btrfs_next_leaf(root, path); 348 - if (ret < 0) 349 - return ERR_PTR(ret); 350 - if (ret > 0) 351 - break; 352 - leaf = path->nodes[0]; 353 - nritems = btrfs_header_nritems(leaf); 354 - continue; 355 - } 356 - 357 - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 336 + btrfs_for_each_slot(root, &key, &key, path, ret) { 358 337 if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) 359 338 break; 360 339 ··· 341 362 name, name_len); 342 363 if (di) 343 364 return di; 344 - 345 - path->slots[0]++; 346 365 } 347 - return NULL; 366 + /* Adjust return code if the key was not found in the next leaf. */ 367 + if (ret > 0) 368 + ret = 0; 369 + 370 + return ERR_PTR(ret); 348 371 } 349 372 350 373 struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,

+148 -166

fs/btrfs/disk-io.c

··· 5 5 6 6 #include <linux/fs.h> 7 7 #include <linux/blkdev.h> 8 - #include <linux/radix-tree.h> 9 8 #include <linux/writeback.h> 10 9 #include <linux/workqueue.h> 11 10 #include <linux/kthread.h> ··· 373 374 * @level: expected level, mandatory check 374 375 * @first_key: expected key of first slot, skip check if NULL 375 376 */ 376 - static int btree_read_extent_buffer_pages(struct extent_buffer *eb, 377 - u64 parent_transid, int level, 378 - struct btrfs_key *first_key) 377 + int btrfs_read_extent_buffer(struct extent_buffer *eb, 378 + u64 parent_transid, int level, 379 + struct btrfs_key *first_key) 379 380 { 380 381 struct btrfs_fs_info *fs_info = eb->fs_info; 381 382 struct extent_io_tree *io_tree; ··· 485 486 uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, 486 487 fs_info->nodesize); 487 488 488 - /* A dirty eb shouldn't disappear from buffer_radix */ 489 + /* A dirty eb shouldn't disappear from extent_buffers */ 489 490 if (WARN_ON(!eb)) 490 491 return -EUCLEAN; 491 492 ··· 518 519 u64 found_start; 519 520 struct extent_buffer *eb; 520 521 521 - if (fs_info->sectorsize < PAGE_SIZE) 522 + if (fs_info->nodesize < PAGE_SIZE) 522 523 return csum_dirty_subpage_buffers(fs_info, bvec); 523 524 524 525 eb = (struct extent_buffer *)page->private; ··· 703 704 704 705 ASSERT(page->private); 705 706 706 - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) 707 + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) 707 708 return validate_subpage_buffer(page, start, end, mirror); 708 709 709 710 eb = (struct extent_buffer *)page->private; ··· 849 850 } 850 851 851 852 blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, 852 - int mirror_num, unsigned long bio_flags, 853 - u64 dio_file_offset, 853 + int mirror_num, u64 dio_file_offset, 854 854 extent_submit_bio_start_t *submit_bio_start) 855 855 { 856 856 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; ··· 872 874 async->status = 0; 873 875 874 876 if 
(op_is_sync(bio->bi_opf)) 875 - btrfs_set_work_high_priority(&async->work); 876 - 877 - btrfs_queue_work(fs_info->workers, &async->work); 877 + btrfs_queue_work(fs_info->hipri_workers, &async->work); 878 + else 879 + btrfs_queue_work(fs_info->workers, &async->work); 878 880 return 0; 879 881 } 880 882 ··· 918 920 return true; 919 921 } 920 922 921 - blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, 922 - int mirror_num, unsigned long bio_flags) 923 + void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) 923 924 { 924 925 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 925 926 blk_status_t ret; ··· 930 933 */ 931 934 ret = btrfs_bio_wq_end_io(fs_info, bio, 932 935 BTRFS_WQ_ENDIO_METADATA); 933 - if (ret) 934 - goto out_w_error; 935 - ret = btrfs_map_bio(fs_info, bio, mirror_num); 936 + if (!ret) 937 + ret = btrfs_map_bio(fs_info, bio, mirror_num); 936 938 } else if (!should_async_write(fs_info, BTRFS_I(inode))) { 937 939 ret = btree_csum_one_bio(bio); 938 - if (ret) 939 - goto out_w_error; 940 - ret = btrfs_map_bio(fs_info, bio, mirror_num); 940 + if (!ret) 941 + ret = btrfs_map_bio(fs_info, bio, mirror_num); 941 942 } else { 942 943 /* 943 944 * kthread helpers are used to submit writes so that 944 945 * checksumming can happen in parallel across all CPUs 945 946 */ 946 947 ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, 947 - 0, btree_submit_bio_start); 948 + btree_submit_bio_start); 948 949 } 949 950 950 - if (ret) 951 - goto out_w_error; 952 - return 0; 953 - 954 - out_w_error: 955 - bio->bi_status = ret; 956 - bio_endio(bio); 957 - return ret; 951 + if (ret) { 952 + bio->bi_status = ret; 953 + bio_endio(bio); 954 + } 958 955 } 959 956 960 957 #ifdef CONFIG_MIGRATION ··· 1109 1118 if (IS_ERR(buf)) 1110 1119 return buf; 1111 1120 1112 - ret = btree_read_extent_buffer_pages(buf, parent_transid, 1113 - level, first_key); 1121 + ret = btrfs_read_extent_buffer(buf, parent_transid, level, 
first_key); 1114 1122 if (ret) { 1115 1123 free_extent_buffer_stale(buf); 1116 1124 return ERR_PTR(ret); 1125 + } 1126 + if (btrfs_check_eb_owner(buf, owner_root)) { 1127 + free_extent_buffer_stale(buf); 1128 + return ERR_PTR(-EUCLEAN); 1117 1129 } 1118 1130 return buf; 1119 1131 ··· 1158 1164 root->nr_delalloc_inodes = 0; 1159 1165 root->nr_ordered_extents = 0; 1160 1166 root->inode_tree = RB_ROOT; 1161 - INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); 1167 + xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); 1162 1168 1163 1169 btrfs_init_root_block_rsv(root); 1164 1170 ··· 1210 1216 btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); 1211 1217 #ifdef CONFIG_BTRFS_DEBUG 1212 1218 INIT_LIST_HEAD(&root->leak_list); 1213 - spin_lock(&fs_info->fs_roots_radix_lock); 1219 + spin_lock(&fs_info->fs_roots_lock); 1214 1220 list_add_tail(&root->leak_list, &fs_info->allocated_roots); 1215 - spin_unlock(&fs_info->fs_roots_radix_lock); 1221 + spin_unlock(&fs_info->fs_roots_lock); 1216 1222 #endif 1217 1223 } 1218 1224 ··· 1557 1563 ret = -EIO; 1558 1564 goto fail; 1559 1565 } 1566 + 1567 + /* 1568 + * For real fs, and not log/reloc trees, root owner must 1569 + * match its root node owner 1570 + */ 1571 + if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && 1572 + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && 1573 + root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 1574 + root->root_key.objectid != btrfs_header_owner(root->node)) { 1575 + btrfs_crit(fs_info, 1576 + "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", 1577 + root->root_key.objectid, root->node->start, 1578 + btrfs_header_owner(root->node), 1579 + root->root_key.objectid); 1580 + ret = -EUCLEAN; 1581 + goto fail; 1582 + } 1560 1583 root->commit_root = btrfs_root_node(root); 1561 1584 return root; 1562 1585 fail: ··· 1659 1648 { 1660 1649 struct btrfs_root *root; 1661 1650 1662 - spin_lock(&fs_info->fs_roots_radix_lock); 1663 - root = 
radix_tree_lookup(&fs_info->fs_roots_radix, 1664 - (unsigned long)root_id); 1651 + spin_lock(&fs_info->fs_roots_lock); 1652 + root = xa_load(&fs_info->fs_roots, (unsigned long)root_id); 1665 1653 if (root) 1666 1654 root = btrfs_grab_root(root); 1667 - spin_unlock(&fs_info->fs_roots_radix_lock); 1655 + spin_unlock(&fs_info->fs_roots_lock); 1668 1656 return root; 1669 1657 } 1670 1658 ··· 1705 1695 { 1706 1696 int ret; 1707 1697 1708 - ret = radix_tree_preload(GFP_NOFS); 1709 - if (ret) 1710 - return ret; 1711 - 1712 - spin_lock(&fs_info->fs_roots_radix_lock); 1713 - ret = radix_tree_insert(&fs_info->fs_roots_radix, 1714 - (unsigned long)root->root_key.objectid, 1715 - root); 1698 + spin_lock(&fs_info->fs_roots_lock); 1699 + ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid, 1700 + root, GFP_NOFS); 1716 1701 if (ret == 0) { 1717 1702 btrfs_grab_root(root); 1718 - set_bit(BTRFS_ROOT_IN_RADIX, &root->state); 1703 + set_bit(BTRFS_ROOT_REGISTERED, &root->state); 1719 1704 } 1720 - spin_unlock(&fs_info->fs_roots_radix_lock); 1721 - radix_tree_preload_end(); 1705 + spin_unlock(&fs_info->fs_roots_lock); 1722 1706 1723 1707 return ret; 1724 1708 } ··· 1968 1964 1969 1965 static int cleaner_kthread(void *arg) 1970 1966 { 1971 - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg; 1967 + struct btrfs_fs_info *fs_info = arg; 1972 1968 int again; 1973 1969 1974 1970 while (1) { ··· 2270 2266 { 2271 2267 btrfs_destroy_workqueue(fs_info->fixup_workers); 2272 2268 btrfs_destroy_workqueue(fs_info->delalloc_workers); 2269 + btrfs_destroy_workqueue(fs_info->hipri_workers); 2273 2270 btrfs_destroy_workqueue(fs_info->workers); 2274 2271 btrfs_destroy_workqueue(fs_info->endio_workers); 2275 2272 btrfs_destroy_workqueue(fs_info->endio_raid56_workers); 2276 - btrfs_destroy_workqueue(fs_info->rmw_workers); 2273 + if (fs_info->rmw_workers) 2274 + destroy_workqueue(fs_info->rmw_workers); 2277 2275 btrfs_destroy_workqueue(fs_info->endio_write_workers); 2278 
2276 btrfs_destroy_workqueue(fs_info->endio_freespace_worker); 2279 2277 btrfs_destroy_workqueue(fs_info->delayed_workers); ··· 2342 2336 btrfs_drew_lock_destroy(&root->snapshot_lock); 2343 2337 free_root_extent_buffers(root); 2344 2338 #ifdef CONFIG_BTRFS_DEBUG 2345 - spin_lock(&root->fs_info->fs_roots_radix_lock); 2339 + spin_lock(&root->fs_info->fs_roots_lock); 2346 2340 list_del_init(&root->leak_list); 2347 - spin_unlock(&root->fs_info->fs_roots_radix_lock); 2341 + spin_unlock(&root->fs_info->fs_roots_lock); 2348 2342 #endif 2349 2343 kfree(root); 2350 2344 } ··· 2352 2346 2353 2347 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) 2354 2348 { 2355 - int ret; 2356 - struct btrfs_root *gang[8]; 2357 - int i; 2349 + struct btrfs_root *root; 2350 + unsigned long index = 0; 2358 2351 2359 2352 while (!list_empty(&fs_info->dead_roots)) { 2360 - gang[0] = list_entry(fs_info->dead_roots.next, 2361 - struct btrfs_root, root_list); 2362 - list_del(&gang[0]->root_list); 2353 + root = list_entry(fs_info->dead_roots.next, 2354 + struct btrfs_root, root_list); 2355 + list_del(&root->root_list); 2363 2356 2364 - if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) 2365 - btrfs_drop_and_free_fs_root(fs_info, gang[0]); 2366 - btrfs_put_root(gang[0]); 2357 + if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) 2358 + btrfs_drop_and_free_fs_root(fs_info, root); 2359 + btrfs_put_root(root); 2367 2360 } 2368 2361 2369 - while (1) { 2370 - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 2371 - (void **)gang, 0, 2372 - ARRAY_SIZE(gang)); 2373 - if (!ret) 2374 - break; 2375 - for (i = 0; i < ret; i++) 2376 - btrfs_drop_and_free_fs_root(fs_info, gang[i]); 2362 + xa_for_each(&fs_info->fs_roots, index, root) { 2363 + btrfs_drop_and_free_fs_root(fs_info, root); 2377 2364 } 2378 2365 } 2379 2366 ··· 2443 2444 unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; 2444 2445 2445 2446 fs_info->workers = 2446 - btrfs_alloc_workqueue(fs_info, "worker", 2447 + 
btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); 2448 + fs_info->hipri_workers = 2449 + btrfs_alloc_workqueue(fs_info, "worker-high", 2447 2450 flags | WQ_HIGHPRI, max_active, 16); 2448 2451 2449 2452 fs_info->delalloc_workers = ··· 2477 2476 fs_info->endio_raid56_workers = 2478 2477 btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, 2479 2478 max_active, 4); 2480 - fs_info->rmw_workers = 2481 - btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); 2479 + fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); 2482 2480 fs_info->endio_write_workers = 2483 2481 btrfs_alloc_workqueue(fs_info, "endio-write", flags, 2484 2482 max_active, 2); ··· 2492 2492 fs_info->discard_ctl.discard_workers = 2493 2493 alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); 2494 2494 2495 - if (!(fs_info->workers && fs_info->delalloc_workers && 2496 - fs_info->flush_workers && 2495 + if (!(fs_info->workers && fs_info->hipri_workers && 2496 + fs_info->delalloc_workers && fs_info->flush_workers && 2497 2497 fs_info->endio_workers && fs_info->endio_meta_workers && 2498 2498 fs_info->endio_meta_write_workers && 2499 2499 fs_info->endio_write_workers && fs_info->endio_raid56_workers && ··· 2815 2815 } 2816 2816 2817 2817 /* 2818 - * For 4K page size, we only support 4K sector size. 2819 - * For 64K page size, we support 64K and 4K sector sizes. 2818 + * We only support at most two sectorsizes: 4K and PAGE_SIZE. 2819 + * 2820 + * We can support 16K sectorsize with 64K page size without problem, 2821 + * but such sectorsize/pagesize combination doesn't make much sense. 2822 + * 4K will be our future standard, PAGE_SIZE is supported from the very 2823 + * beginning. 
2820 2824 */ 2821 - if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || 2822 - (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && 2823 - sectorsize != SZ_64K))) { 2825 + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { 2824 2826 btrfs_err(fs_info, 2825 2827 "sectorsize %llu not yet supported for page size %lu", 2826 2828 sectorsize, PAGE_SIZE); ··· 3134 3132 3135 3133 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) 3136 3134 { 3137 - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 3138 - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); 3135 + xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC); 3136 + xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC); 3139 3137 INIT_LIST_HEAD(&fs_info->trans_list); 3140 3138 INIT_LIST_HEAD(&fs_info->dead_roots); 3141 3139 INIT_LIST_HEAD(&fs_info->delayed_iputs); ··· 3143 3141 INIT_LIST_HEAD(&fs_info->caching_block_groups); 3144 3142 spin_lock_init(&fs_info->delalloc_root_lock); 3145 3143 spin_lock_init(&fs_info->trans_lock); 3146 - spin_lock_init(&fs_info->fs_roots_radix_lock); 3144 + spin_lock_init(&fs_info->fs_roots_lock); 3147 3145 spin_lock_init(&fs_info->delayed_iput_lock); 3148 3146 spin_lock_init(&fs_info->defrag_inodes_lock); 3149 3147 spin_lock_init(&fs_info->super_lock); ··· 3211 3209 btrfs_init_balance(fs_info); 3212 3210 btrfs_init_async_reclaim_work(fs_info); 3213 3211 3214 - spin_lock_init(&fs_info->block_group_cache_lock); 3215 - fs_info->block_group_cache_tree = RB_ROOT; 3216 - fs_info->first_logical_byte = (u64)-1; 3212 + rwlock_init(&fs_info->block_group_cache_lock); 3213 + fs_info->block_group_cache_tree = RB_ROOT_CACHED; 3217 3214 3218 3215 extent_io_tree_init(fs_info, &fs_info->excluded_extents, 3219 3216 IO_TREE_FS_EXCLUDED_EXTENTS, NULL); ··· 3296 3295 3297 3296 static int btrfs_uuid_rescan_kthread(void *data) 3298 3297 { 3299 - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; 3298 + struct btrfs_fs_info *fs_info = data; 3300 3299 int ret; 3301 3300 
3302 3301 /* ··· 3374 3373 /* 3375 3374 * btrfs_find_orphan_roots() is responsible for finding all the dead 3376 3375 * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load 3377 - * them into the fs_info->fs_roots_radix tree. This must be done before 3376 + * them into the fs_info->fs_roots. This must be done before 3378 3377 * calling btrfs_orphan_cleanup() on the tree root. If we don't do it 3379 3378 * first, then btrfs_orphan_cleanup() will delete a dead root's orphan 3380 3379 * item before the root's tree is deleted - this means that if we unmount ··· 3612 3611 ~BTRFS_FEATURE_INCOMPAT_SUPP; 3613 3612 if (features) { 3614 3613 btrfs_err(fs_info, 3615 - "cannot mount because of unsupported optional features (%llx)", 3614 + "cannot mount because of unsupported optional features (0x%llx)", 3616 3615 features); 3617 3616 err = -EINVAL; 3618 3617 goto fail_alloc; ··· 3650 3649 ~BTRFS_FEATURE_COMPAT_RO_SUPP; 3651 3650 if (!sb_rdonly(sb) && features) { 3652 3651 btrfs_err(fs_info, 3653 - "cannot mount read-write because of unsupported optional features (%llx)", 3652 + "cannot mount read-write because of unsupported optional features (0x%llx)", 3654 3653 features); 3655 3654 err = -EINVAL; 3656 3655 goto fail_alloc; ··· 3673 3672 btrfs_warn(fs_info, 3674 3673 "read-write for sector size %u with page size %lu is experimental", 3675 3674 sectorsize, PAGE_SIZE); 3676 - if (btrfs_super_incompat_flags(fs_info->super_copy) & 3677 - BTRFS_FEATURE_INCOMPAT_RAID56) { 3678 - btrfs_err(fs_info, 3679 - "RAID56 is not yet supported for sector size %u with page size %lu", 3680 - sectorsize, PAGE_SIZE); 3681 - err = -EINVAL; 3682 - goto fail_alloc; 3683 - } 3684 3675 subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); 3685 3676 if (!subpage_info) 3686 3677 goto fail_alloc; ··· 4150 4157 if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) 4151 4158 bio->bi_opf |= REQ_FUA; 4152 4159 4153 - btrfsic_submit_bio(bio); 4160 + btrfsic_check_bio(bio); 4161 + 
submit_bio(bio); 4154 4162 4155 4163 if (btrfs_advance_sb_log(device, i)) 4156 4164 errors++; ··· 4265 4271 init_completion(&device->flush_wait); 4266 4272 bio->bi_private = &device->flush_wait; 4267 4273 4268 - btrfsic_submit_bio(bio); 4274 + btrfsic_check_bio(bio); 4275 + submit_bio(bio); 4269 4276 set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); 4270 4277 } 4271 4278 ··· 4499 4504 { 4500 4505 bool drop_ref = false; 4501 4506 4502 - spin_lock(&fs_info->fs_roots_radix_lock); 4503 - radix_tree_delete(&fs_info->fs_roots_radix, 4504 - (unsigned long)root->root_key.objectid); 4505 - if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) 4507 + spin_lock(&fs_info->fs_roots_lock); 4508 + xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid); 4509 + if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state)) 4506 4510 drop_ref = true; 4507 - spin_unlock(&fs_info->fs_roots_radix_lock); 4511 + spin_unlock(&fs_info->fs_roots_lock); 4508 4512 4509 4513 if (BTRFS_FS_ERROR(fs_info)) { 4510 4514 ASSERT(root->log_root == NULL); ··· 4519 4525 4520 4526 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) 4521 4527 { 4522 - u64 root_objectid = 0; 4523 - struct btrfs_root *gang[8]; 4524 - int i = 0; 4528 + struct btrfs_root *roots[8]; 4529 + unsigned long index = 0; 4530 + int i; 4525 4531 int err = 0; 4526 - unsigned int ret = 0; 4532 + int grabbed; 4527 4533 4528 4534 while (1) { 4529 - spin_lock(&fs_info->fs_roots_radix_lock); 4530 - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 4531 - (void **)gang, root_objectid, 4532 - ARRAY_SIZE(gang)); 4533 - if (!ret) { 4534 - spin_unlock(&fs_info->fs_roots_radix_lock); 4535 - break; 4536 - } 4537 - root_objectid = gang[ret - 1]->root_key.objectid + 1; 4535 + struct btrfs_root *root; 4538 4536 4539 - for (i = 0; i < ret; i++) { 4540 - /* Avoid to grab roots in dead_roots */ 4541 - if (btrfs_root_refs(&gang[i]->root_item) == 0) { 4542 - gang[i] = NULL; 4543 - continue; 4544 - } 4545 - /* grab all 
the search result for later use */ 4546 - gang[i] = btrfs_grab_root(gang[i]); 4537 + spin_lock(&fs_info->fs_roots_lock); 4538 + if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) { 4539 + spin_unlock(&fs_info->fs_roots_lock); 4540 + return err; 4547 4541 } 4548 - spin_unlock(&fs_info->fs_roots_radix_lock); 4549 4542 4550 - for (i = 0; i < ret; i++) { 4551 - if (!gang[i]) 4552 - continue; 4553 - root_objectid = gang[i]->root_key.objectid; 4554 - err = btrfs_orphan_cleanup(gang[i]); 4555 - if (err) 4543 + grabbed = 0; 4544 + xa_for_each_start(&fs_info->fs_roots, index, root, index) { 4545 + /* Avoid grabbing roots in dead_roots */ 4546 + if (btrfs_root_refs(&root->root_item) > 0) 4547 + roots[grabbed++] = btrfs_grab_root(root); 4548 + if (grabbed >= ARRAY_SIZE(roots)) 4556 4549 break; 4557 - btrfs_put_root(gang[i]); 4558 4550 } 4559 - root_objectid++; 4551 + spin_unlock(&fs_info->fs_roots_lock); 4552 + 4553 + for (i = 0; i < grabbed; i++) { 4554 + if (!roots[i]) 4555 + continue; 4556 + index = roots[i]->root_key.objectid; 4557 + err = btrfs_orphan_cleanup(roots[i]); 4558 + if (err) 4559 + goto out; 4560 + btrfs_put_root(roots[i]); 4561 + } 4562 + index++; 4560 4563 } 4561 4564 4562 - /* release the uncleaned roots due to error */ 4563 - for (; i < ret; i++) { 4564 - if (gang[i]) 4565 - btrfs_put_root(gang[i]); 4565 + out: 4566 + /* Release the roots that remain uncleaned due to error */ 4567 + for (; i < grabbed; i++) { 4568 + if (roots[i]) 4569 + btrfs_put_root(roots[i]); 4566 4570 } 4567 4571 return err; 4568 4572 } ··· 4855 4863 __btrfs_btree_balance_dirty(fs_info, 0); 4856 4864 } 4857 4865 4858 - int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, 4859 - struct btrfs_key *first_key) 4860 - { 4861 - return btree_read_extent_buffer_pages(buf, parent_transid, 4862 - level, first_key); 4863 - } 4864 - 4865 4866 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) 4866 4867 { 4867 4868 /* cleanup FS via 
transaction */ ··· 4870 4885 4871 4886 static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) 4872 4887 { 4873 - struct btrfs_root *gang[8]; 4874 - u64 root_objectid = 0; 4875 - int ret; 4888 + unsigned long index = 0; 4889 + int grabbed = 0; 4890 + struct btrfs_root *roots[8]; 4876 4891 4877 - spin_lock(&fs_info->fs_roots_radix_lock); 4878 - while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 4879 - (void **)gang, root_objectid, 4880 - ARRAY_SIZE(gang))) != 0) { 4881 - int i; 4892 + spin_lock(&fs_info->fs_roots_lock); 4893 + while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index, 4894 + ULONG_MAX, 8, XA_PRESENT))) { 4895 + for (int i = 0; i < grabbed; i++) 4896 + roots[i] = btrfs_grab_root(roots[i]); 4897 + spin_unlock(&fs_info->fs_roots_lock); 4882 4898 4883 - for (i = 0; i < ret; i++) 4884 - gang[i] = btrfs_grab_root(gang[i]); 4885 - spin_unlock(&fs_info->fs_roots_radix_lock); 4886 - 4887 - for (i = 0; i < ret; i++) { 4888 - if (!gang[i]) 4899 + for (int i = 0; i < grabbed; i++) { 4900 + if (!roots[i]) 4889 4901 continue; 4890 - root_objectid = gang[i]->root_key.objectid; 4891 - btrfs_free_log(NULL, gang[i]); 4892 - btrfs_put_root(gang[i]); 4902 + index = roots[i]->root_key.objectid; 4903 + btrfs_free_log(NULL, roots[i]); 4904 + btrfs_put_root(roots[i]); 4893 4905 } 4894 - root_objectid++; 4895 - spin_lock(&fs_info->fs_roots_radix_lock); 4906 + index++; 4907 + spin_lock(&fs_info->fs_roots_lock); 4896 4908 } 4897 - spin_unlock(&fs_info->fs_roots_radix_lock); 4909 + spin_unlock(&fs_info->fs_roots_lock); 4898 4910 btrfs_free_log_root_tree(NULL, fs_info); 4899 4911 } 4900 4912

+4 -6

fs/btrfs/disk-io.h

··· 87 87 int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, 88 88 struct page *page, u64 start, u64 end, 89 89 int mirror); 90 - blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, 91 - int mirror_num, unsigned long bio_flags); 90 + void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num); 92 91 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 93 92 struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); 94 93 #endif ··· 119 120 void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 120 121 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 121 122 int atomic); 122 - int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, 123 - struct btrfs_key *first_key); 123 + int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, 124 + int level, struct btrfs_key *first_key); 124 125 blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 125 126 enum btrfs_wq_endio_type metadata); 126 127 blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, 127 - int mirror_num, unsigned long bio_flags, 128 - u64 dio_file_offset, 128 + int mirror_num, u64 dio_file_offset, 129 129 extent_submit_bio_start_t *submit_bio_start); 130 130 blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, 131 131 int mirror_num);

+28 -33

fs/btrfs/extent-tree.c

··· 895 895 err = -ENOENT; 896 896 while (1) { 897 897 if (ptr >= end) { 898 - WARN_ON(ptr > end); 898 + if (ptr > end) { 899 + err = -EUCLEAN; 900 + btrfs_print_leaf(path->nodes[0]); 901 + btrfs_crit(fs_info, 902 + "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", 903 + path->slots[0], root_objectid, owner, offset, parent); 904 + } 899 905 break; 900 906 } 901 907 iref = (struct btrfs_extent_inline_ref *)ptr; ··· 1583 1577 u32 item_size; 1584 1578 int ret; 1585 1579 int err = 0; 1586 - int metadata = !extent_op->is_data; 1580 + int metadata = 1; 1587 1581 1588 1582 if (TRANS_ABORTED(trans)) 1589 1583 return 0; 1590 1584 1591 - if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 1585 + if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 1592 1586 metadata = 0; 1593 1587 1594 1588 path = btrfs_alloc_path(); ··· 2186 2180 2187 2181 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2188 2182 struct extent_buffer *eb, u64 flags, 2189 - int level, int is_data) 2183 + int level) 2190 2184 { 2191 2185 struct btrfs_delayed_extent_op *extent_op; 2192 2186 int ret; ··· 2198 2192 extent_op->flags_to_set = flags; 2199 2193 extent_op->update_flags = true; 2200 2194 extent_op->update_key = false; 2201 - extent_op->is_data = is_data ? 
true : false; 2202 2195 extent_op->level = level; 2203 2196 2204 2197 ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); ··· 2362 2357 } 2363 2358 2364 2359 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 2365 - u64 bytenr, bool strict) 2360 + u64 bytenr, bool strict, struct btrfs_path *path) 2366 2361 { 2367 - struct btrfs_path *path; 2368 2362 int ret; 2369 - 2370 - path = btrfs_alloc_path(); 2371 - if (!path) 2372 - return -ENOMEM; 2373 2363 2374 2364 do { 2375 2365 ret = check_committed_ref(root, path, objectid, ··· 2376 2376 } while (ret == -EAGAIN); 2377 2377 2378 2378 out: 2379 - btrfs_free_path(path); 2379 + btrfs_release_path(path); 2380 2380 if (btrfs_is_data_reloc_root(root)) 2381 2381 WARN_ON(ret > 0); 2382 2382 return ret; ··· 2497 2497 return ret; 2498 2498 } 2499 2499 2500 - static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) 2500 + static u64 first_logical_byte(struct btrfs_fs_info *fs_info) 2501 2501 { 2502 - struct btrfs_block_group *cache; 2503 - u64 bytenr; 2502 + struct rb_node *leftmost; 2503 + u64 bytenr = 0; 2504 2504 2505 - spin_lock(&fs_info->block_group_cache_lock); 2506 - bytenr = fs_info->first_logical_byte; 2507 - spin_unlock(&fs_info->block_group_cache_lock); 2505 + read_lock(&fs_info->block_group_cache_lock); 2506 + /* Get the block group with the lowest logical start address. 
*/ 2507 + leftmost = rb_first_cached(&fs_info->block_group_cache_tree); 2508 + if (leftmost) { 2509 + struct btrfs_block_group *bg; 2508 2510 2509 - if (bytenr < (u64)-1) 2510 - return bytenr; 2511 - 2512 - cache = btrfs_lookup_first_block_group(fs_info, search_start); 2513 - if (!cache) 2514 - return 0; 2515 - 2516 - bytenr = cache->start; 2517 - btrfs_put_block_group(cache); 2511 + bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); 2512 + bytenr = bg->start; 2513 + } 2514 + read_unlock(&fs_info->block_group_cache_lock); 2518 2515 2519 2516 return bytenr; 2520 2517 } ··· 3800 3803 3801 3804 /* Check RO and no space case before trying to activate it */ 3802 3805 spin_lock(&block_group->lock); 3803 - if (block_group->ro || 3804 - block_group->alloc_offset == block_group->zone_capacity) { 3806 + if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { 3805 3807 ret = 1; 3806 3808 /* 3807 3809 * May need to clear fs_info->{treelog,data_reloc}_bg. ··· 4268 4272 return ret; 4269 4273 4270 4274 ffe_ctl->search_start = max(ffe_ctl->search_start, 4271 - first_logical_byte(fs_info, 0)); 4275 + first_logical_byte(fs_info)); 4272 4276 ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); 4273 4277 if (ffe_ctl->search_start == ffe_ctl->hint_byte) { 4274 4278 block_group = btrfs_lookup_block_group(fs_info, ··· 4955 4959 extent_op->flags_to_set = flags; 4956 4960 extent_op->update_key = skinny_metadata ? 
false : true; 4957 4961 extent_op->update_flags = true; 4958 - extent_op->is_data = false; 4959 4962 extent_op->level = level; 4960 4963 4961 4964 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, ··· 5139 5144 ret = btrfs_dec_ref(trans, root, eb, 0); 5140 5145 BUG_ON(ret); /* -ENOMEM */ 5141 5146 ret = btrfs_set_disk_extent_flags(trans, eb, flag, 5142 - btrfs_header_level(eb), 0); 5147 + btrfs_header_level(eb)); 5143 5148 BUG_ON(ret); /* -ENOMEM */ 5144 5149 wc->flags[level] |= flag; 5145 5150 } ··· 5813 5818 btrfs_qgroup_convert_reserved_meta(root, INT_MAX); 5814 5819 btrfs_qgroup_free_meta_all_pertrans(root); 5815 5820 5816 - if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) 5821 + if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) 5817 5822 btrfs_add_dropped_root(trans, root); 5818 5823 else 5819 5824 btrfs_put_root(root);

+332 -289

fs/btrfs/extent_io.c

··· 6 6 #include <linux/mm.h> 7 7 #include <linux/pagemap.h> 8 8 #include <linux/page-flags.h> 9 + #include <linux/sched/mm.h> 9 10 #include <linux/spinlock.h> 10 11 #include <linux/blkdev.h> 11 12 #include <linux/swap.h> ··· 29 28 #include "subpage.h" 30 29 #include "zoned.h" 31 30 #include "block-group.h" 31 + #include "compression.h" 32 32 33 33 static struct kmem_cache *extent_state_cache; 34 34 static struct kmem_cache *extent_buffer_cache; ··· 77 75 if (!fs_info->allocated_ebs.next) 78 76 return; 79 77 78 + WARN_ON(!list_empty(&fs_info->allocated_ebs)); 80 79 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 81 80 while (!list_empty(&fs_info->allocated_ebs)) { 82 81 eb = list_first_entry(&fs_info->allocated_ebs, ··· 138 135 struct rb_node rb_node; 139 136 }; 140 137 138 + /* 139 + * Structure to record info about the bio being assembled, and other info like 140 + * how many bytes are there before stripe/ordered extent boundary. 141 + */ 142 + struct btrfs_bio_ctrl { 143 + struct bio *bio; 144 + enum btrfs_compression_type compress_type; 145 + u32 len_to_stripe_boundary; 146 + u32 len_to_oe_boundary; 147 + }; 148 + 141 149 struct extent_page_data { 142 150 struct btrfs_bio_ctrl bio_ctrl; 143 151 /* tells writepage not to lock the state bits for this range ··· 178 164 return ret; 179 165 } 180 166 181 - int __must_check submit_one_bio(struct bio *bio, int mirror_num, 182 - unsigned long bio_flags) 167 + static void submit_one_bio(struct bio *bio, int mirror_num, 168 + enum btrfs_compression_type compress_type) 183 169 { 184 - blk_status_t ret = 0; 185 170 struct extent_io_tree *tree = bio->bi_private; 186 171 187 172 bio->bi_private = NULL; 188 173 189 174 /* Caller should ensure the bio has at least some range added */ 190 175 ASSERT(bio->bi_iter.bi_size); 191 - if (is_data_inode(tree->private_data)) 192 - ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, 193 - bio_flags); 194 - else 195 - ret = btrfs_submit_metadata_bio(tree->private_data, bio, 
196 - mirror_num, bio_flags); 197 176 198 - return blk_status_to_errno(ret); 177 + if (is_data_inode(tree->private_data)) 178 + btrfs_submit_data_bio(tree->private_data, bio, mirror_num, 179 + compress_type); 180 + else 181 + btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num); 182 + /* 183 + * Above submission hooks will handle the error by ending the bio, 184 + * which will do the cleanup properly. So here we should not return 185 + * any error, or the caller of submit_extent_page() will do cleanup 186 + * again, causing problems. 187 + */ 199 188 } 200 189 201 190 /* Cleanup unsubmitted bios */ ··· 219 202 * Return 0 if everything is OK. 220 203 * Return <0 for error. 221 204 */ 222 - static int __must_check flush_write_bio(struct extent_page_data *epd) 205 + static void flush_write_bio(struct extent_page_data *epd) 223 206 { 224 - int ret = 0; 225 207 struct bio *bio = epd->bio_ctrl.bio; 226 208 227 209 if (bio) { 228 - ret = submit_one_bio(bio, 0, 0); 210 + submit_one_bio(bio, 0, 0); 229 211 /* 230 212 * Clean up of epd->bio is handled by its endio function. 
231 213 * And endio is either triggered by successful bio execution ··· 234 218 */ 235 219 epd->bio_ctrl.bio = NULL; 236 220 } 237 - return ret; 238 221 } 239 222 240 223 int __init extent_state_cache_init(void) ··· 2318 2303 u64 length, u64 logical, struct page *page, 2319 2304 unsigned int pg_offset, int mirror_num) 2320 2305 { 2321 - struct bio *bio; 2322 2306 struct btrfs_device *dev; 2307 + struct bio_vec bvec; 2308 + struct bio bio; 2323 2309 u64 map_length = 0; 2324 2310 u64 sector; 2325 2311 struct btrfs_io_context *bioc = NULL; 2326 - int ret; 2312 + int ret = 0; 2327 2313 2328 2314 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); 2329 2315 BUG_ON(!mirror_num); ··· 2332 2316 if (btrfs_repair_one_zone(fs_info, logical)) 2333 2317 return 0; 2334 2318 2335 - bio = btrfs_bio_alloc(1); 2336 - bio->bi_iter.bi_size = 0; 2337 2319 map_length = length; 2338 2320 2339 2321 /* ··· 2349 2335 */ 2350 2336 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 2351 2337 &map_length, &bioc, 0); 2352 - if (ret) { 2353 - btrfs_bio_counter_dec(fs_info); 2354 - bio_put(bio); 2355 - return -EIO; 2356 - } 2338 + if (ret) 2339 + goto out_counter_dec; 2357 2340 ASSERT(bioc->mirror_num == 1); 2358 2341 } else { 2359 2342 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, 2360 2343 &map_length, &bioc, mirror_num); 2361 - if (ret) { 2362 - btrfs_bio_counter_dec(fs_info); 2363 - bio_put(bio); 2364 - return -EIO; 2365 - } 2344 + if (ret) 2345 + goto out_counter_dec; 2366 2346 BUG_ON(mirror_num != bioc->mirror_num); 2367 2347 } 2368 2348 2369 2349 sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; 2370 - bio->bi_iter.bi_sector = sector; 2371 2350 dev = bioc->stripes[bioc->mirror_num - 1].dev; 2372 2351 btrfs_put_bioc(bioc); 2352 + 2373 2353 if (!dev || !dev->bdev || 2374 2354 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 2375 - btrfs_bio_counter_dec(fs_info); 2376 - bio_put(bio); 2377 - return -EIO; 2355 + ret = -EIO; 2356 + goto out_counter_dec; 2378 2357 } 
2379 - bio_set_dev(bio, dev->bdev); 2380 - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; 2381 - bio_add_page(bio, page, length, pg_offset); 2382 2358 2383 - if (btrfsic_submit_bio_wait(bio)) { 2359 + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); 2360 + bio.bi_iter.bi_sector = sector; 2361 + __bio_add_page(&bio, page, length, pg_offset); 2362 + 2363 + btrfsic_check_bio(&bio); 2364 + ret = submit_bio_wait(&bio); 2365 + if (ret) { 2384 2366 /* try to remap that extent elsewhere? */ 2385 - btrfs_bio_counter_dec(fs_info); 2386 - bio_put(bio); 2387 2367 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2388 - return -EIO; 2368 + goto out_bio_uninit; 2389 2369 } 2390 2370 2391 2371 btrfs_info_rl_in_rcu(fs_info, 2392 2372 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2393 2373 ino, start, 2394 2374 rcu_str_deref(dev->name), sector); 2375 + ret = 0; 2376 + 2377 + out_bio_uninit: 2378 + bio_uninit(&bio); 2379 + out_counter_dec: 2395 2380 btrfs_bio_counter_dec(fs_info); 2396 - bio_put(bio); 2397 - return 0; 2381 + return ret; 2398 2382 } 2399 2383 2400 2384 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) ··· 2539 2527 failrec->start = start; 2540 2528 failrec->len = sectorsize; 2541 2529 failrec->this_mirror = 0; 2542 - failrec->bio_flags = 0; 2530 + failrec->compress_type = BTRFS_COMPRESS_NONE; 2543 2531 2544 2532 read_lock(&em_tree->lock); 2545 2533 em = lookup_extent_mapping(em_tree, start, failrec->len); ··· 2563 2551 logical = em->block_start + logical; 2564 2552 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2565 2553 logical = em->block_start; 2566 - failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2567 - extent_set_compress_type(&failrec->bio_flags, em->compress_type); 2554 + failrec->compress_type = em->compress_type; 2568 2555 } 2569 2556 2570 2557 btrfs_debug(fs_info, ··· 2695 2684 * will be handled by the endio on the repair_bio, so we can't return an 2696 2685 * error here. 
2697 2686 */ 2698 - submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags); 2687 + submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type); 2699 2688 return BLK_STS_OK; 2700 2689 } 2701 2690 ··· 2721 2710 btrfs_page_set_error(fs_info, page, start, len); 2722 2711 } 2723 2712 2724 - if (fs_info->sectorsize == PAGE_SIZE) 2713 + if (!btrfs_is_subpage(fs_info, page)) 2725 2714 unlock_page(page); 2726 2715 else 2727 2716 btrfs_subpage_end_reader(fs_info, page, start, len); 2728 2717 } 2729 2718 2730 - static blk_status_t submit_read_repair(struct inode *inode, 2731 - struct bio *failed_bio, u32 bio_offset, 2732 - struct page *page, unsigned int pgoff, 2733 - u64 start, u64 end, int failed_mirror, 2734 - unsigned int error_bitmap, 2735 - submit_bio_hook_t *submit_bio_hook) 2719 + static blk_status_t submit_data_read_repair(struct inode *inode, 2720 + struct bio *failed_bio, 2721 + u32 bio_offset, struct page *page, 2722 + unsigned int pgoff, 2723 + u64 start, u64 end, 2724 + int failed_mirror, 2725 + unsigned int error_bitmap) 2736 2726 { 2737 2727 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2738 2728 const u32 sectorsize = fs_info->sectorsize; ··· 2742 2730 int i; 2743 2731 2744 2732 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2733 + 2734 + /* This repair is only for data */ 2735 + ASSERT(is_data_inode(inode)); 2745 2736 2746 2737 /* We're here because we had some read errors or csum mismatch */ 2747 2738 ASSERT(error_bitmap); ··· 2774 2759 ret = btrfs_repair_one_sector(inode, failed_bio, 2775 2760 bio_offset + offset, 2776 2761 page, pgoff + offset, start + offset, 2777 - failed_mirror, submit_bio_hook); 2762 + failed_mirror, btrfs_submit_data_bio); 2778 2763 if (!ret) { 2779 2764 /* 2780 2765 * We have submitted the read repair, the page release ··· 2958 2943 static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) 2959 2944 { 2960 2945 ASSERT(PageLocked(page)); 2961 - if 
(fs_info->sectorsize == PAGE_SIZE) 2946 + if (!btrfs_is_subpage(fs_info, page)) 2962 2947 return; 2963 2948 2964 2949 ASSERT(PagePrivate(page)); ··· 2966 2951 } 2967 2952 2968 2953 /* 2969 - * Find extent buffer for a givne bytenr. 2954 + * Find extent buffer for a given bytenr. 2970 2955 * 2971 2956 * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking 2972 2957 * in endio context. ··· 2980 2965 * For regular sectorsize, we can use page->private to grab extent 2981 2966 * buffer 2982 2967 */ 2983 - if (fs_info->sectorsize == PAGE_SIZE) { 2968 + if (fs_info->nodesize >= PAGE_SIZE) { 2984 2969 ASSERT(PagePrivate(page) && page->private); 2985 2970 return (struct extent_buffer *)page->private; 2986 2971 } 2987 2972 2988 - /* For subpage case, we need to lookup buffer radix tree */ 2989 - rcu_read_lock(); 2990 - eb = radix_tree_lookup(&fs_info->buffer_radix, 2991 - bytenr >> fs_info->sectorsize_bits); 2992 - rcu_read_unlock(); 2973 + /* For subpage case, we need to lookup extent buffer xarray */ 2974 + eb = xa_load(&fs_info->extent_buffers, 2975 + bytenr >> fs_info->sectorsize_bits); 2993 2976 ASSERT(eb); 2994 2977 return eb; 2995 2978 } ··· 3090 3077 goto readpage_ok; 3091 3078 3092 3079 /* 3093 - * btrfs_submit_read_repair() will handle all the good 3080 + * submit_data_read_repair() will handle all the good 3094 3081 * and bad sectors, we just continue to the next bvec. 3095 3082 */ 3096 - submit_read_repair(inode, bio, bio_offset, page, 3097 - start - page_offset(page), start, 3098 - end, mirror, error_bitmap, 3099 - btrfs_submit_data_bio); 3083 + submit_data_read_repair(inode, bio, bio_offset, page, 3084 + start - page_offset(page), 3085 + start, end, mirror, 3086 + error_bitmap); 3100 3087 3101 3088 ASSERT(bio_offset + len > bio_offset); 3102 3089 bio_offset += len; ··· 3145 3132 bio_put(bio); 3146 3133 } 3147 3134 3135 + /** 3136 + * Populate every free slot in a provided array with pages. 
3137 + * 3138 + * @nr_pages: number of pages to allocate 3139 + * @page_array: the array to fill with pages; any existing non-null entries in 3140 + * the array will be skipped 3141 + * 3142 + * Return: 0 if all pages were able to be allocated; 3143 + * -ENOMEM otherwise, and the caller is responsible for freeing all 3144 + * non-null page pointers in the array. 3145 + */ 3146 + int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) 3147 + { 3148 + unsigned int allocated; 3149 + 3150 + for (allocated = 0; allocated < nr_pages;) { 3151 + unsigned int last = allocated; 3152 + 3153 + allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); 3154 + 3155 + if (allocated == nr_pages) 3156 + return 0; 3157 + 3158 + /* 3159 + * During this iteration, no page could be allocated, even 3160 + * though alloc_pages_bulk_array() falls back to alloc_page() 3161 + * if it could not bulk-allocate. So we must be out of memory. 3162 + */ 3163 + if (allocated == last) 3164 + return -ENOMEM; 3165 + 3166 + memalloc_retry_wait(GFP_NOFS); 3167 + } 3168 + return 0; 3169 + } 3170 + 3148 3171 /* 3149 3172 * Initialize the members up to but not including 'bio'. 
Use after allocating a 3150 3173 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of ··· 3206 3157 return bio; 3207 3158 } 3208 3159 3209 - struct bio *btrfs_bio_clone(struct bio *bio) 3160 + struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio) 3210 3161 { 3211 3162 struct btrfs_bio *bbio; 3212 3163 struct bio *new; 3213 3164 3214 3165 /* Bio allocation backed by a bioset does not fail */ 3215 - new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset); 3166 + new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset); 3216 3167 bbio = btrfs_bio(new); 3217 3168 btrfs_bio_init(bbio); 3218 3169 bbio->iter = bio->bi_iter; ··· 3247 3198 * a contiguous page to the previous one 3248 3199 * @size: portion of page that we want to write 3249 3200 * @pg_offset: starting offset in the page 3250 - * @bio_flags: flags of the current bio to see if we can merge them 3201 + * @compress_type: compression type of the current bio to see if we can merge them 3251 3202 * 3252 3203 * Attempt to add a page to bio considering stripe alignment etc. 
3253 3204 * ··· 3259 3210 struct page *page, 3260 3211 u64 disk_bytenr, unsigned int size, 3261 3212 unsigned int pg_offset, 3262 - unsigned long bio_flags) 3213 + enum btrfs_compression_type compress_type) 3263 3214 { 3264 3215 struct bio *bio = bio_ctrl->bio; 3265 3216 u32 bio_size = bio->bi_iter.bi_size; ··· 3271 3222 ASSERT(bio); 3272 3223 /* The limit should be calculated when bio_ctrl->bio is allocated */ 3273 3224 ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); 3274 - if (bio_ctrl->bio_flags != bio_flags) 3225 + if (bio_ctrl->compress_type != compress_type) 3275 3226 return 0; 3276 3227 3277 - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) 3228 + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) 3278 3229 contig = bio->bi_iter.bi_sector == sector; 3279 3230 else 3280 3231 contig = bio_end_sector(bio) == sector; ··· 3317 3268 * The split happens for real compressed bio, which happens in 3318 3269 * btrfs_submit_compressed_read/write(). 3319 3270 */ 3320 - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) { 3271 + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { 3321 3272 bio_ctrl->len_to_oe_boundary = U32_MAX; 3322 3273 bio_ctrl->len_to_stripe_boundary = U32_MAX; 3323 3274 return 0; ··· 3360 3311 unsigned int opf, 3361 3312 bio_end_io_t end_io_func, 3362 3313 u64 disk_bytenr, u32 offset, u64 file_offset, 3363 - unsigned long bio_flags) 3314 + enum btrfs_compression_type compress_type) 3364 3315 { 3365 3316 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3366 3317 struct bio *bio; ··· 3371 3322 * For compressed page range, its disk_bytenr is always @disk_bytenr 3372 3323 * passed in, no matter if we have added any range into previous bio. 
3373 3324 */ 3374 - if (bio_flags & EXTENT_BIO_COMPRESSED) 3325 + if (compress_type != BTRFS_COMPRESS_NONE) 3375 3326 bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; 3376 3327 else 3377 3328 bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; 3378 3329 bio_ctrl->bio = bio; 3379 - bio_ctrl->bio_flags = bio_flags; 3330 + bio_ctrl->compress_type = compress_type; 3380 3331 bio->bi_end_io = end_io_func; 3381 3332 bio->bi_private = &inode->io_tree; 3382 3333 bio->bi_opf = opf; ··· 3435 3386 * @end_io_func: end_io callback for new bio 3436 3387 * @mirror_num: desired mirror to read/write 3437 3388 * @prev_bio_flags: flags of previous bio to see if we can merge the current one 3438 - * @bio_flags: flags of the current bio to see if we can merge them 3389 + * @compress_type: compress type for current bio 3439 3390 */ 3440 3391 static int submit_extent_page(unsigned int opf, 3441 3392 struct writeback_control *wbc, ··· 3444 3395 size_t size, unsigned long pg_offset, 3445 3396 bio_end_io_t end_io_func, 3446 3397 int mirror_num, 3447 - unsigned long bio_flags, 3398 + enum btrfs_compression_type compress_type, 3448 3399 bool force_bio_submit) 3449 3400 { 3450 3401 int ret = 0; ··· 3456 3407 ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && 3457 3408 pg_offset + size <= PAGE_SIZE); 3458 3409 if (force_bio_submit && bio_ctrl->bio) { 3459 - ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags); 3410 + submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); 3460 3411 bio_ctrl->bio = NULL; 3461 - if (ret < 0) 3462 - return ret; 3463 3412 } 3464 3413 3465 3414 while (cur < pg_offset + size) { ··· 3469 3422 ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, 3470 3423 end_io_func, disk_bytenr, offset, 3471 3424 page_offset(page) + cur, 3472 - bio_flags); 3425 + compress_type); 3473 3426 if (ret < 0) 3474 3427 return ret; 3475 3428 } ··· 3477 3430 * We must go through btrfs_bio_add_page() to ensure each 3478 3431 * page range won't cross 
various boundaries. 3479 3432 */ 3480 - if (bio_flags & EXTENT_BIO_COMPRESSED) 3433 + if (compress_type != BTRFS_COMPRESS_NONE) 3481 3434 added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, 3482 3435 size - offset, pg_offset + offset, 3483 - bio_flags); 3436 + compress_type); 3484 3437 else 3485 3438 added = btrfs_bio_add_page(bio_ctrl, page, 3486 3439 disk_bytenr + offset, size - offset, 3487 - pg_offset + offset, bio_flags); 3440 + pg_offset + offset, compress_type); 3488 3441 3489 3442 /* Metadata page range should never be split */ 3490 3443 if (!is_data_inode(&inode->vfs_inode)) ··· 3498 3451 if (added < size - offset) { 3499 3452 /* The bio should contain some page(s) */ 3500 3453 ASSERT(bio_ctrl->bio->bi_iter.bi_size); 3501 - ret = submit_one_bio(bio_ctrl->bio, mirror_num, 3502 - bio_ctrl->bio_flags); 3454 + submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); 3503 3455 bio_ctrl->bio = NULL; 3504 - if (ret < 0) 3505 - return ret; 3506 3456 } 3507 3457 cur += added; 3508 3458 } ··· 3522 3478 if (page->mapping) 3523 3479 lockdep_assert_held(&page->mapping->private_lock); 3524 3480 3525 - if (fs_info->sectorsize == PAGE_SIZE) { 3481 + if (fs_info->nodesize >= PAGE_SIZE) { 3526 3482 if (!PagePrivate(page)) 3527 3483 attach_page_private(page, eb); 3528 3484 else ··· 3557 3513 3558 3514 fs_info = btrfs_sb(page->mapping->host->i_sb); 3559 3515 3560 - if (fs_info->sectorsize < PAGE_SIZE) 3516 + if (btrfs_is_subpage(fs_info, page)) 3561 3517 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); 3562 3518 3563 3519 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); ··· 3574 3530 return; 3575 3531 3576 3532 fs_info = btrfs_sb(page->mapping->host->i_sb); 3577 - if (fs_info->sectorsize < PAGE_SIZE) 3533 + if (btrfs_is_subpage(fs_info, page)) 3578 3534 return btrfs_detach_subpage(fs_info, page); 3579 3535 3580 3536 detach_page_private(page); ··· 3613 3569 * XXX JDM: This needs looking at to ensure proper page locking 3614 3570 * return 0 
on success, otherwise return error 3615 3571 */ 3616 - int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, 3572 + static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, 3617 3573 struct btrfs_bio_ctrl *bio_ctrl, 3618 3574 unsigned int read_flags, u64 *prev_em_start) 3619 3575 { ··· 3682 3638 BUG_ON(extent_map_end(em) <= cur); 3683 3639 BUG_ON(end < cur); 3684 3640 3685 - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 3686 - this_bio_flag |= EXTENT_BIO_COMPRESSED; 3687 - extent_set_compress_type(&this_bio_flag, 3688 - em->compress_type); 3689 - } 3641 + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3642 + this_bio_flag = em->compress_type; 3690 3643 3691 3644 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3692 3645 cur_end = min(extent_map_end(em) - 1, end); 3693 3646 iosize = ALIGN(iosize, blocksize); 3694 - if (this_bio_flag & EXTENT_BIO_COMPRESSED) 3647 + if (this_bio_flag != BTRFS_COMPRESS_NONE) 3695 3648 disk_bytenr = em->block_start; 3696 3649 else 3697 3650 disk_bytenr = em->block_start + extent_offset; ··· 3784 3743 this_bio_flag, 3785 3744 force_bio_submit); 3786 3745 if (ret) { 3787 - unlock_extent(tree, cur, cur + iosize - 1); 3788 - end_page_read(page, false, cur, iosize); 3746 + /* 3747 + * We have to unlock the remaining range, or the page 3748 + * will never be unlocked. 
3749 + */ 3750 + unlock_extent(tree, cur, end); 3751 + end_page_read(page, false, cur, end + 1 - cur); 3789 3752 goto out; 3790 3753 } 3791 3754 cur = cur + iosize; 3792 3755 pg_offset += iosize; 3793 3756 } 3794 3757 out: 3758 + return ret; 3759 + } 3760 + 3761 + int btrfs_readpage(struct file *file, struct page *page) 3762 + { 3763 + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 3764 + u64 start = page_offset(page); 3765 + u64 end = start + PAGE_SIZE - 1; 3766 + struct btrfs_bio_ctrl bio_ctrl = { 0 }; 3767 + int ret; 3768 + 3769 + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); 3770 + 3771 + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); 3772 + /* 3773 + * If btrfs_do_readpage() failed we will want to submit the assembled 3774 + * bio to do the cleanup. 3775 + */ 3776 + if (bio_ctrl.bio) 3777 + submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); 3795 3778 return ret; 3796 3779 } 3797 3780 ··· 3835 3770 REQ_RAHEAD, prev_em_start); 3836 3771 put_page(pages[index]); 3837 3772 } 3838 - } 3839 - 3840 - static void update_nr_written(struct writeback_control *wbc, 3841 - unsigned long nr_written) 3842 - { 3843 - wbc->nr_to_write -= nr_written; 3844 3773 } 3845 3774 3846 3775 /* ··· 3936 3877 * For regular sector size == page size case, since one page only 3937 3878 * contains one sector, we return the page offset directly. 
3938 3879 */ 3939 - if (fs_info->sectorsize == PAGE_SIZE) { 3880 + if (!btrfs_is_subpage(fs_info, page)) { 3940 3881 *start = page_offset(page); 3941 3882 *end = page_offset(page) + PAGE_SIZE; 3942 3883 return; ··· 3979 3920 u64 extent_offset; 3980 3921 u64 block_start; 3981 3922 struct extent_map *em; 3923 + int saved_ret = 0; 3982 3924 int ret = 0; 3983 3925 int nr = 0; 3984 3926 u32 opf = REQ_OP_WRITE; 3985 3927 const unsigned int write_flags = wbc_to_write_flags(wbc); 3928 + bool has_error = false; 3986 3929 bool compressed; 3987 3930 3988 3931 ret = btrfs_writepage_cow_fixup(page); ··· 3999 3938 * we don't want to touch the inode after unlocking the page, 4000 3939 * so we update the mapping writeback index now 4001 3940 */ 4002 - update_nr_written(wbc, 1); 3941 + wbc->nr_to_write--; 4003 3942 4004 3943 while (cur <= end) { 4005 3944 u64 disk_bytenr; ··· 4034 3973 if (IS_ERR(em)) { 4035 3974 btrfs_page_set_error(fs_info, page, cur, end - cur + 1); 4036 3975 ret = PTR_ERR_OR_ZERO(em); 3976 + has_error = true; 3977 + if (!saved_ret) 3978 + saved_ret = ret; 4037 3979 break; 4038 3980 } 4039 3981 ··· 4100 4036 end_bio_extent_writepage, 4101 4037 0, 0, false); 4102 4038 if (ret) { 4039 + has_error = true; 4040 + if (!saved_ret) 4041 + saved_ret = ret; 4042 + 4103 4043 btrfs_page_set_error(fs_info, page, cur, iosize); 4104 4044 if (PageWriteback(page)) 4105 4045 btrfs_page_clear_writeback(fs_info, page, cur, ··· 4117 4049 * If we finish without problem, we should not only clear page dirty, 4118 4050 * but also empty subpage dirty bits 4119 4051 */ 4120 - if (!ret) 4052 + if (!has_error) 4121 4053 btrfs_page_assert_not_dirty(fs_info, page); 4054 + else 4055 + ret = saved_ret; 4122 4056 *nr_ret = nr; 4123 4057 return ret; 4124 4058 } ··· 4251 4181 4252 4182 static void end_extent_buffer_writeback(struct extent_buffer *eb) 4253 4183 { 4254 - if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) 4255 - btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); 4256 - 
4257 4184 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 4258 4185 smp_mb__after_atomic(); 4259 4186 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); ··· 4270 4203 struct extent_page_data *epd) 4271 4204 { 4272 4205 struct btrfs_fs_info *fs_info = eb->fs_info; 4273 - int i, num_pages, failed_page_nr; 4206 + int i, num_pages; 4274 4207 int flush = 0; 4275 4208 int ret = 0; 4276 4209 4277 4210 if (!btrfs_try_tree_write_lock(eb)) { 4278 - ret = flush_write_bio(epd); 4279 - if (ret < 0) 4280 - return ret; 4211 + flush_write_bio(epd); 4281 4212 flush = 1; 4282 4213 btrfs_tree_lock(eb); 4283 4214 } ··· 4285 4220 if (!epd->sync_io) 4286 4221 return 0; 4287 4222 if (!flush) { 4288 - ret = flush_write_bio(epd); 4289 - if (ret < 0) 4290 - return ret; 4223 + flush_write_bio(epd); 4291 4224 flush = 1; 4292 4225 } 4293 4226 while (1) { ··· 4323 4260 * Subpage metadata doesn't use page locking at all, so we can skip 4324 4261 * the page locking. 4325 4262 */ 4326 - if (!ret || fs_info->sectorsize < PAGE_SIZE) 4263 + if (!ret || fs_info->nodesize < PAGE_SIZE) 4327 4264 return ret; 4328 4265 4329 4266 num_pages = num_extent_pages(eb); ··· 4332 4269 4333 4270 if (!trylock_page(p)) { 4334 4271 if (!flush) { 4335 - int err; 4336 - 4337 - err = flush_write_bio(epd); 4338 - if (err < 0) { 4339 - ret = err; 4340 - failed_page_nr = i; 4341 - goto err_unlock; 4342 - } 4272 + flush_write_bio(epd); 4343 4273 flush = 1; 4344 4274 } 4345 4275 lock_page(p); 4346 4276 } 4347 4277 } 4348 4278 4349 - return ret; 4350 - err_unlock: 4351 - /* Unlock already locked pages */ 4352 - for (i = 0; i < failed_page_nr; i++) 4353 - unlock_page(eb->pages[i]); 4354 - /* 4355 - * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. 4356 - * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can 4357 - * be made and undo everything done before. 
4358 - */ 4359 - btrfs_tree_lock(eb); 4360 - spin_lock(&eb->refs_lock); 4361 - set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 4362 - end_extent_buffer_writeback(eb); 4363 - spin_unlock(&eb->refs_lock); 4364 - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, 4365 - fs_info->dirty_metadata_batch); 4366 - btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 4367 - btrfs_tree_unlock(eb); 4368 4279 return ret; 4369 4280 } 4370 4281 ··· 4434 4397 struct extent_buffer *eb; 4435 4398 4436 4399 rcu_read_lock(); 4437 - eb = radix_tree_lookup(&fs_info->buffer_radix, 4438 - start >> fs_info->sectorsize_bits); 4400 + eb = xa_load(&fs_info->extent_buffers, 4401 + start >> fs_info->sectorsize_bits); 4439 4402 if (eb && atomic_inc_not_zero(&eb->refs)) { 4440 4403 rcu_read_unlock(); 4441 4404 return eb; ··· 4457 4420 struct bvec_iter_all iter_all; 4458 4421 4459 4422 fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); 4460 - ASSERT(fs_info->sectorsize < PAGE_SIZE); 4423 + ASSERT(fs_info->nodesize < PAGE_SIZE); 4461 4424 4462 4425 ASSERT(!bio_flagged(bio, BIO_CLONED)); 4463 4426 bio_for_each_segment_all(bvec, bio, iter_all) { ··· 4609 4572 * dirty anymore, we have submitted a page. Update nr_written in wbc. 4610 4573 */ 4611 4574 if (no_dirty_ebs) 4612 - update_nr_written(wbc, 1); 4575 + wbc->nr_to_write--; 4613 4576 return ret; 4614 4577 } 4615 4578 ··· 4645 4608 break; 4646 4609 } 4647 4610 disk_bytenr += PAGE_SIZE; 4648 - update_nr_written(wbc, 1); 4611 + wbc->nr_to_write--; 4649 4612 unlock_page(p); 4650 4613 } 4651 4614 ··· 4784 4747 if (!PagePrivate(page)) 4785 4748 return 0; 4786 4749 4787 - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) 4750 + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) 4788 4751 return submit_eb_subpage(page, wbc, epd); 4789 4752 4790 4753 spin_lock(&mapping->private_lock); ··· 4840 4803 /* 4841 4804 * Implies write in zoned mode. Mark the last eb in a block group. 
4842 4805 */ 4843 - if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) 4844 - set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); 4806 + btrfs_schedule_zone_finish_bg(cache, eb); 4845 4807 btrfs_put_block_group(cache); 4846 4808 } 4847 4809 ret = write_one_eb(eb, wbc, epd); ··· 4959 4923 * if the fs already has error. 4960 4924 */ 4961 4925 if (!BTRFS_FS_ERROR(fs_info)) { 4962 - ret = flush_write_bio(&epd); 4926 + flush_write_bio(&epd); 4963 4927 } else { 4964 4928 ret = -EROFS; 4965 4929 end_write_bio(&epd, ret); 4966 4930 } 4967 4931 out: 4968 4932 btrfs_zoned_meta_io_unlock(fs_info); 4933 + /* 4934 + * We can get ret > 0 from submit_extent_page() indicating how many ebs 4935 + * were submitted. Reset it to 0 to avoid false alerts for the caller. 4936 + */ 4937 + if (ret > 0) 4938 + ret = 0; 4969 4939 return ret; 4970 4940 } 4971 4941 ··· 5073 5031 * tmpfs file mapping 5074 5032 */ 5075 5033 if (!trylock_page(page)) { 5076 - ret = flush_write_bio(epd); 5077 - BUG_ON(ret < 0); 5034 + flush_write_bio(epd); 5078 5035 lock_page(page); 5079 5036 } 5080 5037 ··· 5083 5042 } 5084 5043 5085 5044 if (wbc->sync_mode != WB_SYNC_NONE) { 5086 - if (PageWriteback(page)) { 5087 - ret = flush_write_bio(epd); 5088 - BUG_ON(ret < 0); 5089 - } 5045 + if (PageWriteback(page)) 5046 + flush_write_bio(epd); 5090 5047 wait_on_page_writeback(page); 5091 5048 } 5092 5049 ··· 5124 5085 * page in our current bio, and thus deadlock, so flush the 5125 5086 * write bio here. 
5126 5087 */ 5127 - ret = flush_write_bio(epd); 5128 - if (!ret) 5129 - goto retry; 5088 + flush_write_bio(epd); 5089 + goto retry; 5130 5090 } 5131 5091 5132 5092 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) ··· 5151 5113 return ret; 5152 5114 } 5153 5115 5154 - ret = flush_write_bio(&epd); 5155 - ASSERT(ret <= 0); 5116 + flush_write_bio(&epd); 5156 5117 return ret; 5157 5118 } 5158 5119 ··· 5213 5176 } 5214 5177 5215 5178 if (!found_error) 5216 - ret = flush_write_bio(&epd); 5179 + flush_write_bio(&epd); 5217 5180 else 5218 5181 end_write_bio(&epd, ret); 5219 5182 ··· 5246 5209 end_write_bio(&epd, ret); 5247 5210 return ret; 5248 5211 } 5249 - ret = flush_write_bio(&epd); 5212 + flush_write_bio(&epd); 5250 5213 return ret; 5251 5214 } 5252 5215 ··· 5269 5232 if (em_cached) 5270 5233 free_extent_map(em_cached); 5271 5234 5272 - if (bio_ctrl.bio) { 5273 - if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags)) 5274 - return; 5275 - } 5235 + if (bio_ctrl.bio) 5236 + submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); 5276 5237 } 5277 5238 5278 5239 /* ··· 5839 5804 return; 5840 5805 } 5841 5806 5842 - if (fs_info->sectorsize == PAGE_SIZE) { 5807 + if (fs_info->nodesize >= PAGE_SIZE) { 5843 5808 /* 5844 5809 * We do this since we'll remove the pages after we've 5845 5810 * removed the eb from the radix tree, so we could race ··· 5946 5911 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) 5947 5912 { 5948 5913 int i; 5949 - struct page *p; 5950 5914 struct extent_buffer *new; 5951 5915 int num_pages = num_extent_pages(src); 5916 + int ret; 5952 5917 5953 5918 new = __alloc_extent_buffer(src->fs_info, src->start, src->len); 5954 5919 if (new == NULL) ··· 5961 5926 */ 5962 5927 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); 5963 5928 5929 + memset(new->pages, 0, sizeof(*new->pages) * num_pages); 5930 + ret = btrfs_alloc_page_array(num_pages, new->pages); 5931 + if (ret) { 5932 + btrfs_release_extent_buffer(new); 
5933 + return NULL; 5934 + } 5935 + 5964 5936 for (i = 0; i < num_pages; i++) { 5965 5937 int ret; 5938 + struct page *p = new->pages[i]; 5966 5939 5967 - p = alloc_page(GFP_NOFS); 5968 - if (!p) { 5969 - btrfs_release_extent_buffer(new); 5970 - return NULL; 5971 - } 5972 5940 ret = attach_extent_buffer_page(new, p, NULL); 5973 5941 if (ret < 0) { 5974 - put_page(p); 5975 5942 btrfs_release_extent_buffer(new); 5976 5943 return NULL; 5977 5944 } 5978 5945 WARN_ON(PageDirty(p)); 5979 - new->pages[i] = p; 5980 5946 copy_page(page_address(p), page_address(src->pages[i])); 5981 5947 } 5982 5948 set_extent_buffer_uptodate(new); ··· 5991 5955 struct extent_buffer *eb; 5992 5956 int num_pages; 5993 5957 int i; 5958 + int ret; 5994 5959 5995 5960 eb = __alloc_extent_buffer(fs_info, start, len); 5996 5961 if (!eb) 5997 5962 return NULL; 5998 5963 5999 5964 num_pages = num_extent_pages(eb); 6000 - for (i = 0; i < num_pages; i++) { 6001 - int ret; 5965 + ret = btrfs_alloc_page_array(num_pages, eb->pages); 5966 + if (ret) 5967 + goto err; 6002 5968 6003 - eb->pages[i] = alloc_page(GFP_NOFS); 6004 - if (!eb->pages[i]) 6005 - goto err; 6006 - ret = attach_extent_buffer_page(eb, eb->pages[i], NULL); 5969 + for (i = 0; i < num_pages; i++) { 5970 + struct page *p = eb->pages[i]; 5971 + 5972 + ret = attach_extent_buffer_page(eb, p, NULL); 6007 5973 if (ret < 0) 6008 5974 goto err; 6009 5975 } 5976 + 6010 5977 set_extent_buffer_uptodate(eb); 6011 5978 btrfs_set_header_nritems(eb, 0); 6012 5979 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 6013 5980 6014 5981 return eb; 6015 5982 err: 6016 - for (; i > 0; i--) { 6017 - detach_extent_buffer_page(eb, eb->pages[i - 1]); 6018 - __free_page(eb->pages[i - 1]); 5983 + for (i = 0; i < num_pages; i++) { 5984 + if (eb->pages[i]) { 5985 + detach_extent_buffer_page(eb, eb->pages[i]); 5986 + __free_page(eb->pages[i]); 5987 + } 6019 5988 } 6020 5989 __free_extent_buffer(eb); 6021 5990 return NULL; ··· 6127 6086 if (!eb) 6128 6087 return 
ERR_PTR(-ENOMEM); 6129 6088 eb->fs_info = fs_info; 6130 - again: 6131 - ret = radix_tree_preload(GFP_NOFS); 6132 - if (ret) { 6133 - exists = ERR_PTR(ret); 6134 - goto free_eb; 6135 - } 6136 - spin_lock(&fs_info->buffer_lock); 6137 - ret = radix_tree_insert(&fs_info->buffer_radix, 6138 - start >> fs_info->sectorsize_bits, eb); 6139 - spin_unlock(&fs_info->buffer_lock); 6140 - radix_tree_preload_end(); 6141 - if (ret == -EEXIST) { 6142 - exists = find_extent_buffer(fs_info, start); 6143 - if (exists) 6089 + 6090 + do { 6091 + ret = xa_insert(&fs_info->extent_buffers, 6092 + start >> fs_info->sectorsize_bits, 6093 + eb, GFP_NOFS); 6094 + if (ret == -ENOMEM) { 6095 + exists = ERR_PTR(ret); 6144 6096 goto free_eb; 6145 - else 6146 - goto again; 6147 - } 6097 + } 6098 + if (ret == -EBUSY) { 6099 + exists = find_extent_buffer(fs_info, start); 6100 + if (exists) 6101 + goto free_eb; 6102 + } 6103 + } while (ret); 6104 + 6148 6105 check_buffer_tree_ref(eb); 6149 6106 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 6150 6107 ··· 6163 6124 * don't try to insert two ebs for the same bytenr. So here we always 6164 6125 * return NULL and just continue. 
6165 6126 */ 6166 - if (fs_info->sectorsize < PAGE_SIZE) 6127 + if (fs_info->nodesize < PAGE_SIZE) 6167 6128 return NULL; 6168 6129 6169 6130 /* Page not yet attached to an extent buffer */ ··· 6185 6146 return NULL; 6186 6147 } 6187 6148 6149 + static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) 6150 + { 6151 + if (!IS_ALIGNED(start, fs_info->sectorsize)) { 6152 + btrfs_err(fs_info, "bad tree block start %llu", start); 6153 + return -EINVAL; 6154 + } 6155 + 6156 + if (fs_info->nodesize < PAGE_SIZE && 6157 + offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { 6158 + btrfs_err(fs_info, 6159 + "tree block crosses page boundary, start %llu nodesize %u", 6160 + start, fs_info->nodesize); 6161 + return -EINVAL; 6162 + } 6163 + if (fs_info->nodesize >= PAGE_SIZE && 6164 + !IS_ALIGNED(start, PAGE_SIZE)) { 6165 + btrfs_err(fs_info, 6166 + "tree block is not page aligned, start %llu nodesize %u", 6167 + start, fs_info->nodesize); 6168 + return -EINVAL; 6169 + } 6170 + return 0; 6171 + } 6172 + 6188 6173 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 6189 6174 u64 start, u64 owner_root, int level) 6190 6175 { ··· 6223 6160 int uptodate = 1; 6224 6161 int ret; 6225 6162 6226 - if (!IS_ALIGNED(start, fs_info->sectorsize)) { 6227 - btrfs_err(fs_info, "bad tree block start %llu", start); 6163 + if (check_eb_alignment(fs_info, start)) 6228 6164 return ERR_PTR(-EINVAL); 6229 - } 6230 6165 6231 6166 #if BITS_PER_LONG == 32 6232 6167 if (start >= MAX_LFS_FILESIZE) { ··· 6236 6175 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD) 6237 6176 btrfs_warn_32bit_limit(fs_info); 6238 6177 #endif 6239 - 6240 - if (fs_info->sectorsize < PAGE_SIZE && 6241 - offset_in_page(start) + len > PAGE_SIZE) { 6242 - btrfs_err(fs_info, 6243 - "tree block crosses page boundary, start %llu nodesize %lu", 6244 - start, len); 6245 - return ERR_PTR(-EINVAL); 6246 - } 6247 6178 6248 6179 eb = find_extent_buffer(fs_info, start); 6249 6180 if (eb) ··· 6266 6213 * 
page, but it may change in the future for 16K page size 6267 6214 * support, so we still preallocate the memory in the loop. 6268 6215 */ 6269 - if (fs_info->sectorsize < PAGE_SIZE) { 6216 + if (fs_info->nodesize < PAGE_SIZE) { 6270 6217 prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); 6271 6218 if (IS_ERR(prealloc)) { 6272 6219 ret = PTR_ERR(prealloc); ··· 6317 6264 } 6318 6265 if (uptodate) 6319 6266 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 6320 - again: 6321 - ret = radix_tree_preload(GFP_NOFS); 6322 - if (ret) { 6323 - exists = ERR_PTR(ret); 6324 - goto free_eb; 6325 - } 6326 6267 6327 - spin_lock(&fs_info->buffer_lock); 6328 - ret = radix_tree_insert(&fs_info->buffer_radix, 6329 - start >> fs_info->sectorsize_bits, eb); 6330 - spin_unlock(&fs_info->buffer_lock); 6331 - radix_tree_preload_end(); 6332 - if (ret == -EEXIST) { 6333 - exists = find_extent_buffer(fs_info, start); 6334 - if (exists) 6268 + do { 6269 + ret = xa_insert(&fs_info->extent_buffers, 6270 + start >> fs_info->sectorsize_bits, 6271 + eb, GFP_NOFS); 6272 + if (ret == -ENOMEM) { 6273 + exists = ERR_PTR(ret); 6335 6274 goto free_eb; 6336 - else 6337 - goto again; 6338 - } 6275 + } 6276 + if (ret == -EBUSY) { 6277 + exists = find_extent_buffer(fs_info, start); 6278 + if (exists) 6279 + goto free_eb; 6280 + } 6281 + } while (ret); 6282 + 6339 6283 /* add one reference for the tree */ 6340 6284 check_buffer_tree_ref(eb); 6341 6285 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); ··· 6377 6327 6378 6328 spin_unlock(&eb->refs_lock); 6379 6329 6380 - spin_lock(&fs_info->buffer_lock); 6381 - radix_tree_delete(&fs_info->buffer_radix, 6382 - eb->start >> fs_info->sectorsize_bits); 6383 - spin_unlock(&fs_info->buffer_lock); 6330 + xa_erase(&fs_info->extent_buffers, 6331 + eb->start >> fs_info->sectorsize_bits); 6384 6332 } else { 6385 6333 spin_unlock(&eb->refs_lock); 6386 6334 } ··· 6480 6432 int num_pages; 6481 6433 struct page *page; 6482 6434 6483 - if (eb->fs_info->sectorsize < 
PAGE_SIZE) 6435 + if (eb->fs_info->nodesize < PAGE_SIZE) 6484 6436 return clear_subpage_extent_buffer_dirty(eb); 6485 6437 6486 6438 num_pages = num_extent_pages(eb); ··· 6512 6464 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 6513 6465 6514 6466 if (!was_dirty) { 6515 - bool subpage = eb->fs_info->sectorsize < PAGE_SIZE; 6467 + bool subpage = eb->fs_info->nodesize < PAGE_SIZE; 6516 6468 6517 6469 /* 6518 6470 * For subpage case, we can have other extent buffers in the ··· 6552 6504 num_pages = num_extent_pages(eb); 6553 6505 for (i = 0; i < num_pages; i++) { 6554 6506 page = eb->pages[i]; 6555 - if (page) 6556 - btrfs_page_clear_uptodate(fs_info, page, 6557 - eb->start, eb->len); 6507 + if (!page) 6508 + continue; 6509 + 6510 + /* 6511 + * This is special handling for metadata subpage, as regular 6512 + * btrfs_is_subpage() can not handle cloned/dummy metadata. 6513 + */ 6514 + if (fs_info->nodesize >= PAGE_SIZE) 6515 + ClearPageUptodate(page); 6516 + else 6517 + btrfs_subpage_clear_uptodate(fs_info, page, eb->start, 6518 + eb->len); 6558 6519 } 6559 6520 } 6560 6521 ··· 6578 6521 num_pages = num_extent_pages(eb); 6579 6522 for (i = 0; i < num_pages; i++) { 6580 6523 page = eb->pages[i]; 6581 - btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len); 6524 + 6525 + /* 6526 + * This is special handling for metadata subpage, as regular 6527 + * btrfs_is_subpage() can not handle cloned/dummy metadata. 
6528 + */ 6529 + if (fs_info->nodesize >= PAGE_SIZE) 6530 + SetPageUptodate(page); 6531 + else 6532 + btrfs_subpage_set_uptodate(fs_info, page, eb->start, 6533 + eb->len); 6582 6534 } 6583 6535 } 6584 6536 ··· 6643 6577 atomic_dec(&eb->io_pages); 6644 6578 } 6645 6579 if (bio_ctrl.bio) { 6646 - int tmp; 6647 - 6648 - tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0); 6580 + submit_one_bio(bio_ctrl.bio, mirror_num, 0); 6649 6581 bio_ctrl.bio = NULL; 6650 - if (tmp < 0) 6651 - return tmp; 6652 6582 } 6653 6583 if (ret || wait != WAIT_COMPLETE) 6654 6584 return ret; ··· 6678 6616 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) 6679 6617 return -EIO; 6680 6618 6681 - if (eb->fs_info->sectorsize < PAGE_SIZE) 6619 + if (eb->fs_info->nodesize < PAGE_SIZE) 6682 6620 return read_extent_buffer_subpage(eb, wait, mirror_num); 6683 6621 6684 6622 num_pages = num_extent_pages(eb); ··· 6757 6695 } 6758 6696 6759 6697 if (bio_ctrl.bio) { 6760 - err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags); 6698 + submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.compress_type); 6761 6699 bio_ctrl.bio = NULL; 6762 - if (err) 6763 - return err; 6764 6700 } 6765 6701 6766 6702 if (ret || wait != WAIT_COMPLETE) ··· 6931 6871 * would have !PageUptodate && !PageError, as we clear PageError before 6932 6872 * reading. 
6933 6873 */ 6934 - if (fs_info->sectorsize < PAGE_SIZE) { 6874 + if (fs_info->nodesize < PAGE_SIZE) { 6935 6875 bool uptodate, error; 6936 6876 6937 6877 uptodate = btrfs_subpage_test_uptodate(fs_info, page, ··· 7033 6973 7034 6974 ASSERT(dst->len == src->len); 7035 6975 7036 - if (dst->fs_info->sectorsize == PAGE_SIZE) { 6976 + if (dst->fs_info->nodesize >= PAGE_SIZE) { 7037 6977 num_pages = num_extent_pages(dst); 7038 6978 for (i = 0; i < num_pages; i++) 7039 6979 copy_page(page_address(dst->pages[i]), ··· 7042 6982 size_t src_offset = get_eb_offset_in_page(src, 0); 7043 6983 size_t dst_offset = get_eb_offset_in_page(dst, 0); 7044 6984 7045 - ASSERT(src->fs_info->sectorsize < PAGE_SIZE); 6985 + ASSERT(src->fs_info->nodesize < PAGE_SIZE); 7046 6986 memcpy(page_address(dst->pages[0]) + dst_offset, 7047 6987 page_address(src->pages[0]) + src_offset, 7048 6988 src->len); ··· 7323 7263 } 7324 7264 } 7325 7265 7326 - #define GANG_LOOKUP_SIZE 16 7327 7266 static struct extent_buffer *get_next_extent_buffer( 7328 7267 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) 7329 7268 { 7330 - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; 7331 - struct extent_buffer *found = NULL; 7269 + struct extent_buffer *eb; 7270 + unsigned long index; 7332 7271 u64 page_start = page_offset(page); 7333 - u64 cur = page_start; 7334 7272 7335 7273 ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); 7336 7274 lockdep_assert_held(&fs_info->buffer_lock); 7337 7275 7338 - while (cur < page_start + PAGE_SIZE) { 7339 - int ret; 7340 - int i; 7341 - 7342 - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, 7343 - (void **)gang, cur >> fs_info->sectorsize_bits, 7344 - min_t(unsigned int, GANG_LOOKUP_SIZE, 7345 - PAGE_SIZE / fs_info->nodesize)); 7346 - if (ret == 0) 7347 - goto out; 7348 - for (i = 0; i < ret; i++) { 7349 - /* Already beyond page end */ 7350 - if (gang[i]->start >= page_start + PAGE_SIZE) 7351 - goto out; 7352 - /* Found one */ 7353 - if (gang[i]->start >= bytenr) { 7354 - 
found = gang[i]; 7355 - goto out; 7356 - } 7357 - } 7358 - cur = gang[ret - 1]->start + gang[ret - 1]->len; 7276 + xa_for_each_start(&fs_info->extent_buffers, index, eb, 7277 + page_start >> fs_info->sectorsize_bits) { 7278 + if (in_range(eb->start, page_start, PAGE_SIZE)) 7279 + return eb; 7280 + else if (eb->start >= page_start + PAGE_SIZE) 7281 + /* Already beyond page end */ 7282 + return NULL; 7359 7283 } 7360 - out: 7361 - return found; 7284 + return NULL; 7362 7285 } 7363 7286 7364 7287 static int try_release_subpage_extent_buffer(struct page *page) ··· 7418 7375 { 7419 7376 struct extent_buffer *eb; 7420 7377 7421 - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) 7378 + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) 7422 7379 return try_release_subpage_extent_buffer(page); 7423 7380 7424 7381 /*

+8 -39

fs/btrfs/extent_io.h

··· 7 7 #include <linux/refcount.h> 8 8 #include <linux/fiemap.h> 9 9 #include <linux/btrfs_tree.h> 10 + #include "compression.h" 10 11 #include "ulist.h" 11 - 12 - /* 13 - * flags for bio submission. The high bits indicate the compression 14 - * type for this bio 15 - */ 16 - #define EXTENT_BIO_COMPRESSED 1 17 - #define EXTENT_BIO_FLAG_SHIFT 16 18 12 19 13 enum { 20 14 EXTENT_BUFFER_UPTODATE, ··· 26 32 /* write IO error */ 27 33 EXTENT_BUFFER_WRITE_ERR, 28 34 EXTENT_BUFFER_NO_CHECK, 29 - EXTENT_BUFFER_ZONE_FINISH, 30 35 }; 31 36 32 37 /* these are flags for __process_pages_contig */ ··· 64 71 struct io_failure_record; 65 72 struct extent_io_tree; 66 73 67 - typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, 74 + typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio, 68 75 int mirror_num, 69 - unsigned long bio_flags); 76 + enum btrfs_compression_type compress_type); 70 77 71 78 typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, 72 79 struct bio *bio, u64 dio_file_offset); ··· 93 100 #ifdef CONFIG_BTRFS_DEBUG 94 101 struct list_head leak_list; 95 102 #endif 96 - }; 97 - 98 - /* 99 - * Structure to record info about the bio being assembled, and other info like 100 - * how many bytes are there before stripe/ordered extent boundary. 
101 - */ 102 - struct btrfs_bio_ctrl { 103 - struct bio *bio; 104 - unsigned long bio_flags; 105 - u32 len_to_stripe_boundary; 106 - u32 len_to_oe_boundary; 107 103 }; 108 104 109 105 /* ··· 140 158 kfree(changeset); 141 159 } 142 160 143 - static inline void extent_set_compress_type(unsigned long *bio_flags, 144 - int compress_type) 145 - { 146 - *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; 147 - } 148 - 149 - static inline int extent_compress_type(unsigned long bio_flags) 150 - { 151 - return bio_flags >> EXTENT_BIO_FLAG_SHIFT; 152 - } 153 - 154 161 struct extent_map_tree; 155 162 156 163 typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, ··· 149 178 int try_release_extent_mapping(struct page *page, gfp_t mask); 150 179 int try_release_extent_buffer(struct page *page); 151 180 152 - int __must_check submit_one_bio(struct bio *bio, int mirror_num, 153 - unsigned long bio_flags); 154 - int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, 155 - struct btrfs_bio_ctrl *bio_ctrl, 156 - unsigned int read_flags, u64 *prev_em_start); 181 + int btrfs_readpage(struct file *file, struct page *page); 157 182 int extent_write_full_page(struct page *page, struct writeback_control *wbc); 158 183 int extent_write_locked_range(struct inode *inode, u64 start, u64 end); 159 184 int extent_writepages(struct address_space *mapping, ··· 244 277 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 245 278 struct page *locked_page, 246 279 u32 bits_to_clear, unsigned long page_ops); 280 + 281 + int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); 247 282 struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); 248 - struct bio *btrfs_bio_clone(struct bio *bio); 283 + struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio); 249 284 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); 250 285 251 286 void end_extent_writepage(struct page *page, int err, u64 
start, u64 end); ··· 266 297 u64 start; 267 298 u64 len; 268 299 u64 logical; 269 - unsigned long bio_flags; 300 + enum btrfs_compression_type compress_type; 270 301 int this_mirror; 271 302 int failed_mirror; 272 303 };

+105 -181

fs/btrfs/file.c

··· 1460 1460 return ret; 1461 1461 } 1462 1462 1463 - static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, 1464 - size_t *write_bytes, bool nowait) 1463 + /* 1464 + * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) 1465 + * 1466 + * @pos: File offset. 1467 + * @write_bytes: The length to write, will be updated to the nocow writeable 1468 + * range. 1469 + * 1470 + * This function will flush ordered extents in the range to ensure proper 1471 + * nocow checks. 1472 + * 1473 + * Return: 1474 + * > 0 If we can nocow, and updates @write_bytes. 1475 + * 0 If we can't do a nocow write. 1476 + * -EAGAIN If we can't do a nocow write because snapshoting of the inode's 1477 + * root is in progress. 1478 + * < 0 If an error happened. 1479 + * 1480 + * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. 1481 + */ 1482 + int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, 1483 + size_t *write_bytes) 1465 1484 { 1466 1485 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1467 1486 struct btrfs_root *root = inode->root; ··· 1491 1472 if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) 1492 1473 return 0; 1493 1474 1494 - if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock)) 1475 + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) 1495 1476 return -EAGAIN; 1496 1477 1497 1478 lockstart = round_down(pos, fs_info->sectorsize); ··· 1499 1480 fs_info->sectorsize) - 1; 1500 1481 num_bytes = lockend - lockstart + 1; 1501 1482 1502 - if (nowait) { 1503 - struct btrfs_ordered_extent *ordered; 1504 - 1505 - if (!try_lock_extent(&inode->io_tree, lockstart, lockend)) 1506 - return -EAGAIN; 1507 - 1508 - ordered = btrfs_lookup_ordered_range(inode, lockstart, 1509 - num_bytes); 1510 - if (ordered) { 1511 - btrfs_put_ordered_extent(ordered); 1512 - ret = -EAGAIN; 1513 - goto out_unlock; 1514 - } 1515 - } else { 1516 - btrfs_lock_and_flush_ordered_range(inode, lockstart, 1517 - lockend, 
NULL); 1518 - } 1519 - 1483 + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); 1520 1484 ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, 1521 1485 NULL, NULL, NULL, false); 1522 1486 if (ret <= 0) { 1523 1487 ret = 0; 1524 - if (!nowait) 1525 - btrfs_drew_write_unlock(&root->snapshot_lock); 1488 + btrfs_drew_write_unlock(&root->snapshot_lock); 1526 1489 } else { 1527 1490 *write_bytes = min_t(size_t, *write_bytes , 1528 1491 num_bytes - pos + lockstart); 1529 1492 } 1530 - out_unlock: 1531 1493 unlock_extent(&inode->io_tree, lockstart, lockend); 1532 1494 1533 1495 return ret; 1534 - } 1535 - 1536 - static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos, 1537 - size_t *write_bytes) 1538 - { 1539 - return check_can_nocow(inode, pos, write_bytes, true); 1540 - } 1541 - 1542 - /* 1543 - * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) 1544 - * 1545 - * @pos: File offset 1546 - * @write_bytes: The length to write, will be updated to the nocow writeable 1547 - * range 1548 - * 1549 - * This function will flush ordered extents in the range to ensure proper 1550 - * nocow checks. 1551 - * 1552 - * Return: 1553 - * >0 and update @write_bytes if we can do nocow write 1554 - * 0 if we can't do nocow write 1555 - * -EAGAIN if we can't get the needed lock or there are ordered extents 1556 - * for * (nowait == true) case 1557 - * <0 if other error happened 1558 - * 1559 - * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock(). 
1560 - */ 1561 - int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, 1562 - size_t *write_bytes) 1563 - { 1564 - return check_can_nocow(inode, pos, write_bytes, false); 1565 1496 } 1566 1497 1567 1498 void btrfs_check_nocow_unlock(struct btrfs_inode *inode) ··· 1548 1579 loff_t oldsize; 1549 1580 loff_t start_pos; 1550 1581 1551 - if (iocb->ki_flags & IOCB_NOWAIT) { 1552 - size_t nocow_bytes = count; 1553 - 1554 - /* We will allocate space in case nodatacow is not set, so bail */ 1555 - if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0) 1556 - return -EAGAIN; 1557 - /* 1558 - * There are holes in the range or parts of the range that must 1559 - * be COWed (shared extents, RO block groups, etc), so just bail 1560 - * out. 1561 - */ 1562 - if (nocow_bytes < count) 1563 - return -EAGAIN; 1564 - } 1582 + /* 1583 + * Quickly bail out on NOWAIT writes if we don't have the nodatacow or 1584 + * prealloc flags, as without those flags we always have to COW. We will 1585 + * later check if we can really COW into the target range (using 1586 + * can_nocow_extent() at btrfs_get_blocks_direct_write()). 
1587 + */ 1588 + if ((iocb->ki_flags & IOCB_NOWAIT) && 1589 + !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) 1590 + return -EAGAIN; 1565 1591 1566 1592 current->backing_dev_info = inode_to_bdi(inode); 1567 1593 ret = file_remove_privs(file); ··· 1684 1720 WARN_ON(reserve_bytes == 0); 1685 1721 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 1686 1722 reserve_bytes, 1687 - reserve_bytes); 1723 + reserve_bytes, false); 1688 1724 if (ret) { 1689 1725 if (!only_release_metadata) 1690 1726 btrfs_free_reserved_data_space(BTRFS_I(inode), ··· 1929 1965 */ 1930 1966 again: 1931 1967 from->nofault = true; 1932 - err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 1933 - IOMAP_DIO_PARTIAL, written); 1968 + err = btrfs_dio_rw(iocb, from, written); 1934 1969 from->nofault = false; 1935 1970 1936 1971 /* No increment (+=) because iomap returns a cumulative value. */ ··· 2533 2570 return ret; 2534 2571 } 2535 2572 2536 - static int btrfs_punch_hole_lock_range(struct inode *inode, 2537 - const u64 lockstart, 2538 - const u64 lockend, 2539 - struct extent_state **cached_state) 2573 + static void btrfs_punch_hole_lock_range(struct inode *inode, 2574 + const u64 lockstart, 2575 + const u64 lockend, 2576 + struct extent_state **cached_state) 2540 2577 { 2541 2578 /* 2542 2579 * For subpage case, if the range is not at page boundary, we could ··· 2550 2587 const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; 2551 2588 2552 2589 while (1) { 2553 - struct btrfs_ordered_extent *ordered; 2554 - int ret; 2555 - 2556 2590 truncate_pagecache_range(inode, lockstart, lockend); 2557 2591 2558 2592 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2559 2593 cached_state); 2560 - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), 2561 - lockend); 2562 - 2563 2594 /* 2564 - * We need to make sure we have no ordered extents in this range 2565 - * and nobody raced in and read a page in this range, if we did 2566 - * 
we need to try again. 2595 + * We can't have ordered extents in the range, nor dirty/writeback 2596 + * pages, because we have locked the inode's VFS lock in exclusive 2597 + * mode, we have locked the inode's i_mmap_lock in exclusive mode, 2598 + * we have flushed all delalloc in the range and we have waited 2599 + * for any ordered extents in the range to complete. 2600 + * We can race with anyone reading pages from this range, so after 2601 + * locking the range check if we have pages in the range, and if 2602 + * we do, unlock the range and retry. 2567 2603 */ 2568 - if ((!ordered || 2569 - (ordered->file_offset + ordered->num_bytes <= lockstart || 2570 - ordered->file_offset > lockend)) && 2571 - !filemap_range_has_page(inode->i_mapping, 2572 - page_lockstart, page_lockend)) { 2573 - if (ordered) 2574 - btrfs_put_ordered_extent(ordered); 2604 + if (!filemap_range_has_page(inode->i_mapping, page_lockstart, 2605 + page_lockend)) 2575 2606 break; 2576 - } 2577 - if (ordered) 2578 - btrfs_put_ordered_extent(ordered); 2607 + 2579 2608 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, 2580 2609 lockend, cached_state); 2581 - ret = btrfs_wait_ordered_range(inode, lockstart, 2582 - lockend - lockstart + 1); 2583 - if (ret) 2584 - return ret; 2585 2610 } 2586 - return 0; 2611 + 2612 + btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); 2587 2613 } 2588 2614 2589 2615 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, ··· 2928 2976 bool truncated_block = false; 2929 2977 bool updated_inode = false; 2930 2978 2979 + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); 2980 + 2931 2981 ret = btrfs_wait_ordered_range(inode, offset, len); 2932 2982 if (ret) 2933 - return ret; 2983 + goto out_only_mutex; 2934 2984 2935 - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); 2936 2985 ino_size = round_up(inode->i_size, fs_info->sectorsize); 2937 2986 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); 2938 2987 if (ret < 0) ··· 3025 3072 goto 
out_only_mutex; 3026 3073 } 3027 3074 3028 - ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, 3029 - &cached_state); 3030 - if (ret) 3031 - goto out_only_mutex; 3075 + btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); 3032 3076 3033 3077 path = btrfs_alloc_path(); 3034 3078 if (!path) { ··· 3187 3237 u64 bytes_to_reserve = 0; 3188 3238 bool space_reserved = false; 3189 3239 3190 - inode_dio_wait(inode); 3191 - 3192 3240 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, 3193 3241 alloc_end - alloc_start); 3194 3242 if (IS_ERR(em)) { ··· 3316 3368 if (ret < 0) 3317 3369 goto out; 3318 3370 space_reserved = true; 3319 - ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, 3320 - &cached_state); 3321 - if (ret) 3322 - goto out; 3371 + btrfs_punch_hole_lock_range(inode, lockstart, lockend, 3372 + &cached_state); 3323 3373 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, 3324 3374 alloc_start, bytes_to_reserve); 3325 3375 if (ret) { ··· 3363 3417 u64 alloc_hint = 0; 3364 3418 u64 locked_end; 3365 3419 u64 actual_end = 0; 3420 + u64 data_space_needed = 0; 3421 + u64 data_space_reserved = 0; 3422 + u64 qgroup_reserved = 0; 3366 3423 struct extent_map *em; 3367 3424 int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); 3368 3425 int ret; ··· 3385 3436 3386 3437 if (mode & FALLOC_FL_PUNCH_HOLE) 3387 3438 return btrfs_punch_hole(file, offset, len); 3388 - 3389 - /* 3390 - * Only trigger disk allocation, don't trigger qgroup reserve 3391 - * 3392 - * For qgroup space, it will be checked later. 3393 - */ 3394 - if (!(mode & FALLOC_FL_ZERO_RANGE)) { 3395 - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 3396 - alloc_end - alloc_start); 3397 - if (ret < 0) 3398 - return ret; 3399 - } 3400 3439 3401 3440 btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); 3402 3441 ··· 3422 3485 } 3423 3486 3424 3487 /* 3425 - * wait for ordered IO before we have any locks. We'll loop again 3426 - * below with the locks held. 
3488 + * We have locked the inode at the VFS level (in exclusive mode) and we 3489 + * have locked the i_mmap_lock lock (in exclusive mode). Now before 3490 + * locking the file range, flush all dealloc in the range and wait for 3491 + * all ordered extents in the range to complete. After this we can lock 3492 + * the file range and, due to the previous locking we did, we know there 3493 + * can't be more delalloc or ordered extents in the range. 3427 3494 */ 3428 3495 ret = btrfs_wait_ordered_range(inode, alloc_start, 3429 3496 alloc_end - alloc_start); ··· 3441 3500 } 3442 3501 3443 3502 locked_end = alloc_end - 1; 3444 - while (1) { 3445 - struct btrfs_ordered_extent *ordered; 3503 + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 3504 + &cached_state); 3446 3505 3447 - /* the extent lock is ordered inside the running 3448 - * transaction 3449 - */ 3450 - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, 3451 - locked_end, &cached_state); 3452 - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), 3453 - locked_end); 3454 - 3455 - if (ordered && 3456 - ordered->file_offset + ordered->num_bytes > alloc_start && 3457 - ordered->file_offset < alloc_end) { 3458 - btrfs_put_ordered_extent(ordered); 3459 - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 3460 - alloc_start, locked_end, 3461 - &cached_state); 3462 - /* 3463 - * we can't wait on the range with the transaction 3464 - * running or with the extent lock held 3465 - */ 3466 - ret = btrfs_wait_ordered_range(inode, alloc_start, 3467 - alloc_end - alloc_start); 3468 - if (ret) 3469 - goto out; 3470 - } else { 3471 - if (ordered) 3472 - btrfs_put_ordered_extent(ordered); 3473 - break; 3474 - } 3475 - } 3506 + btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); 3476 3507 3477 3508 /* First, check if we exceed the qgroup limit */ 3478 3509 INIT_LIST_HEAD(&reserve_list); ··· 3461 3548 if (em->block_start == EXTENT_MAP_HOLE || 3462 3549 (cur_offset >= inode->i_size 
&& 3463 3550 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 3464 - ret = add_falloc_range(&reserve_list, cur_offset, 3465 - last_byte - cur_offset); 3551 + const u64 range_len = last_byte - cur_offset; 3552 + 3553 + ret = add_falloc_range(&reserve_list, cur_offset, range_len); 3466 3554 if (ret < 0) { 3467 3555 free_extent_map(em); 3468 3556 break; 3469 3557 } 3470 3558 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), 3471 - &data_reserved, cur_offset, 3472 - last_byte - cur_offset); 3559 + &data_reserved, cur_offset, range_len); 3473 3560 if (ret < 0) { 3474 - cur_offset = last_byte; 3475 3561 free_extent_map(em); 3476 3562 break; 3477 3563 } 3478 - } else { 3479 - /* 3480 - * Do not need to reserve unwritten extent for this 3481 - * range, free reserved data space first, otherwise 3482 - * it'll result in false ENOSPC error. 3483 - */ 3484 - btrfs_free_reserved_data_space(BTRFS_I(inode), 3485 - data_reserved, cur_offset, 3486 - last_byte - cur_offset); 3564 + qgroup_reserved += range_len; 3565 + data_space_needed += range_len; 3487 3566 } 3488 3567 free_extent_map(em); 3489 3568 cur_offset = last_byte; 3569 + } 3570 + 3571 + if (!ret && data_space_needed > 0) { 3572 + /* 3573 + * We are safe to reserve space here as we can't have delalloc 3574 + * in the range, see above. 3575 + */ 3576 + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 3577 + data_space_needed); 3578 + if (!ret) 3579 + data_space_reserved = data_space_needed; 3490 3580 } 3491 3581 3492 3582 /* ··· 3497 3581 * Or just cleanup the list and exit. 3498 3582 */ 3499 3583 list_for_each_entry_safe(range, tmp, &reserve_list, list) { 3500 - if (!ret) 3584 + if (!ret) { 3501 3585 ret = btrfs_prealloc_file_range(inode, mode, 3502 3586 range->start, 3503 3587 range->len, i_blocksize(inode), 3504 3588 offset + len, &alloc_hint); 3505 - else 3589 + /* 3590 + * btrfs_prealloc_file_range() releases space even 3591 + * if it returns an error. 
3592 + */ 3593 + data_space_reserved -= range->len; 3594 + qgroup_reserved -= range->len; 3595 + } else if (data_space_reserved > 0) { 3506 3596 btrfs_free_reserved_data_space(BTRFS_I(inode), 3507 - data_reserved, range->start, 3508 - range->len); 3597 + data_reserved, range->start, 3598 + range->len); 3599 + data_space_reserved -= range->len; 3600 + qgroup_reserved -= range->len; 3601 + } else if (qgroup_reserved > 0) { 3602 + btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, 3603 + range->start, range->len); 3604 + qgroup_reserved -= range->len; 3605 + } 3509 3606 list_del(&range->list); 3510 3607 kfree(range); 3511 3608 } ··· 3535 3606 &cached_state); 3536 3607 out: 3537 3608 btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); 3538 - /* Let go of our reservation. */ 3539 - if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) 3540 - btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, 3541 - cur_offset, alloc_end - cur_offset); 3542 3609 extent_changeset_free(data_reserved); 3543 3610 return ret; 3544 3611 } ··· 3692 3767 */ 3693 3768 pagefault_disable(); 3694 3769 to->nofault = true; 3695 - ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 3696 - IOMAP_DIO_PARTIAL, read); 3770 + ret = btrfs_dio_rw(iocb, to, read); 3697 3771 to->nofault = false; 3698 3772 pagefault_enable(); 3699 3773

+6 -3

fs/btrfs/free-space-cache.c

··· 2630 2630 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2631 2631 u64 bytenr, u64 size, bool used) 2632 2632 { 2633 - struct btrfs_fs_info *fs_info = block_group->fs_info; 2633 + struct btrfs_space_info *sinfo = block_group->space_info; 2634 2634 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2635 2635 u64 offset = bytenr - block_group->start; 2636 2636 u64 to_free, to_unusable; 2637 - const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold); 2637 + int bg_reclaim_threshold = 0; 2638 2638 bool initial = (size == block_group->length); 2639 2639 u64 reclaimable_unusable; 2640 2640 2641 2641 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2642 + 2643 + if (!initial) 2644 + bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2642 2645 2643 2646 spin_lock(&ctl->tree_lock); 2644 2647 if (!used) ··· 4072 4069 4073 4070 btrfs_info(fs_info, "cleaning free space cache v1"); 4074 4071 4075 - node = rb_first(&fs_info->block_group_cache_tree); 4072 + node = rb_first_cached(&fs_info->block_group_cache_tree); 4076 4073 while (node) { 4077 4074 block_group = rb_entry(node, struct btrfs_block_group, cache_node); 4078 4075 ret = btrfs_remove_free_space_inode(trans, NULL, block_group);

+1 -1

fs/btrfs/free-space-tree.c

··· 1178 1178 goto abort; 1179 1179 } 1180 1180 1181 - node = rb_first(&fs_info->block_group_cache_tree); 1181 + node = rb_first_cached(&fs_info->block_group_cache_tree); 1182 1182 while (node) { 1183 1183 block_group = rb_entry(node, struct btrfs_block_group, 1184 1184 cache_node);

+965 -907

fs/btrfs/inode.c

··· 64 64 struct btrfs_dio_data { 65 65 ssize_t submitted; 66 66 struct extent_changeset *data_reserved; 67 + bool data_space_reserved; 68 + bool nocow_done; 67 69 }; 70 + 71 + struct btrfs_dio_private { 72 + struct inode *inode; 73 + 74 + /* 75 + * Since DIO can use anonymous page, we cannot use page_offset() to 76 + * grab the file offset, thus need a dedicated member for file offset. 77 + */ 78 + u64 file_offset; 79 + /* Used for bio::bi_size */ 80 + u32 bytes; 81 + 82 + /* 83 + * References to this structure. There is one reference per in-flight 84 + * bio plus one while we're still setting up. 85 + */ 86 + refcount_t refs; 87 + 88 + /* Array of checksums */ 89 + u8 *csums; 90 + 91 + /* This must be last */ 92 + struct bio bio; 93 + }; 94 + 95 + static struct bio_set btrfs_dio_bioset; 68 96 69 97 struct btrfs_rename_ctx { 70 98 /* Output field. Stores the index number of the old directory entry. */ ··· 250 222 static int btrfs_dirty_inode(struct inode *inode); 251 223 252 224 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 253 - struct inode *inode, struct inode *dir, 254 - const struct qstr *qstr) 225 + struct btrfs_new_inode_args *args) 255 226 { 256 227 int err; 257 228 258 - err = btrfs_init_acl(trans, inode, dir); 259 - if (!err) 260 - err = btrfs_xattr_security_init(trans, inode, dir, qstr); 261 - return err; 229 + if (args->default_acl) { 230 + err = __btrfs_set_acl(trans, args->inode, args->default_acl, 231 + ACL_TYPE_DEFAULT); 232 + if (err) 233 + return err; 234 + } 235 + if (args->acl) { 236 + err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); 237 + if (err) 238 + return err; 239 + } 240 + if (!args->default_acl && !args->acl) 241 + cache_no_acl(args->inode); 242 + return btrfs_xattr_security_init(trans, args->inode, args->dir, 243 + &args->dentry->d_name); 262 244 } 263 245 264 246 /* ··· 1645 1607 nr_written, 1); 1646 1608 } 1647 1609 1610 + struct can_nocow_file_extent_args { 1611 + /* Input fields. 
*/ 1612 + 1613 + /* Start file offset of the range we want to NOCOW. */ 1614 + u64 start; 1615 + /* End file offset (inclusive) of the range we want to NOCOW. */ 1616 + u64 end; 1617 + bool writeback_path; 1618 + bool strict; 1619 + /* 1620 + * Free the path passed to can_nocow_file_extent() once it's not needed 1621 + * anymore. 1622 + */ 1623 + bool free_path; 1624 + 1625 + /* Output fields. Only set when can_nocow_file_extent() returns 1. */ 1626 + 1627 + u64 disk_bytenr; 1628 + u64 disk_num_bytes; 1629 + u64 extent_offset; 1630 + /* Number of bytes that can be written to in NOCOW mode. */ 1631 + u64 num_bytes; 1632 + }; 1633 + 1634 + /* 1635 + * Check if we can NOCOW the file extent that the path points to. 1636 + * This function may return with the path released, so the caller should check 1637 + * if path->nodes[0] is NULL or not if it needs to use the path afterwards. 1638 + * 1639 + * Returns: < 0 on error 1640 + * 0 if we can not NOCOW 1641 + * 1 if we can NOCOW 1642 + */ 1643 + static int can_nocow_file_extent(struct btrfs_path *path, 1644 + struct btrfs_key *key, 1645 + struct btrfs_inode *inode, 1646 + struct can_nocow_file_extent_args *args) 1647 + { 1648 + const bool is_freespace_inode = btrfs_is_free_space_inode(inode); 1649 + struct extent_buffer *leaf = path->nodes[0]; 1650 + struct btrfs_root *root = inode->root; 1651 + struct btrfs_file_extent_item *fi; 1652 + u64 extent_end; 1653 + u8 extent_type; 1654 + int can_nocow = 0; 1655 + int ret = 0; 1656 + 1657 + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 1658 + extent_type = btrfs_file_extent_type(leaf, fi); 1659 + 1660 + if (extent_type == BTRFS_FILE_EXTENT_INLINE) 1661 + goto out; 1662 + 1663 + /* Can't access these fields unless we know it's not an inline extent. 
*/ 1664 + args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1665 + args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 1666 + args->extent_offset = btrfs_file_extent_offset(leaf, fi); 1667 + 1668 + if (!(inode->flags & BTRFS_INODE_NODATACOW) && 1669 + extent_type == BTRFS_FILE_EXTENT_REG) 1670 + goto out; 1671 + 1672 + /* 1673 + * If the extent was created before the generation where the last snapshot 1674 + * for its subvolume was created, then this implies the extent is shared, 1675 + * hence we must COW. 1676 + */ 1677 + if (!args->strict && 1678 + btrfs_file_extent_generation(leaf, fi) <= 1679 + btrfs_root_last_snapshot(&root->root_item)) 1680 + goto out; 1681 + 1682 + /* An explicit hole, must COW. */ 1683 + if (args->disk_bytenr == 0) 1684 + goto out; 1685 + 1686 + /* Compressed/encrypted/encoded extents must be COWed. */ 1687 + if (btrfs_file_extent_compression(leaf, fi) || 1688 + btrfs_file_extent_encryption(leaf, fi) || 1689 + btrfs_file_extent_other_encoding(leaf, fi)) 1690 + goto out; 1691 + 1692 + extent_end = btrfs_file_extent_end(path); 1693 + 1694 + /* 1695 + * The following checks can be expensive, as they need to take other 1696 + * locks and do btree or rbtree searches, so release the path to avoid 1697 + * blocking other tasks for too long. 1698 + */ 1699 + btrfs_release_path(path); 1700 + 1701 + ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), 1702 + key->offset - args->extent_offset, 1703 + args->disk_bytenr, false, path); 1704 + WARN_ON_ONCE(ret > 0 && is_freespace_inode); 1705 + if (ret != 0) 1706 + goto out; 1707 + 1708 + if (args->free_path) { 1709 + /* 1710 + * We don't need the path anymore, plus through the 1711 + * csum_exist_in_range() call below we will end up allocating 1712 + * another path. So free the path to avoid unnecessary extra 1713 + * memory usage. 1714 + */ 1715 + btrfs_free_path(path); 1716 + path = NULL; 1717 + } 1718 + 1719 + /* If there are pending snapshots for this root, we must COW. 
*/ 1720 + if (args->writeback_path && !is_freespace_inode && 1721 + atomic_read(&root->snapshot_force_cow)) 1722 + goto out; 1723 + 1724 + args->disk_bytenr += args->extent_offset; 1725 + args->disk_bytenr += args->start - key->offset; 1726 + args->num_bytes = min(args->end + 1, extent_end) - args->start; 1727 + 1728 + /* 1729 + * Force COW if csums exist in the range. This ensures that csums for a 1730 + * given extent are either valid or do not exist. 1731 + */ 1732 + ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes); 1733 + WARN_ON_ONCE(ret > 0 && is_freespace_inode); 1734 + if (ret != 0) 1735 + goto out; 1736 + 1737 + can_nocow = 1; 1738 + out: 1739 + if (args->free_path && path) 1740 + btrfs_free_path(path); 1741 + 1742 + return ret < 0 ? ret : can_nocow; 1743 + } 1744 + 1648 1745 /* 1649 1746 * when nowcow writeback call back. This checks for snapshots or COW copies 1650 1747 * of the extents that exist in the file, and COWs the file as required. ··· 1800 1627 u64 cur_offset = start; 1801 1628 int ret; 1802 1629 bool check_prev = true; 1803 - const bool freespace_inode = btrfs_is_free_space_inode(inode); 1804 1630 u64 ino = btrfs_ino(inode); 1631 + struct btrfs_block_group *bg; 1805 1632 bool nocow = false; 1806 - u64 disk_bytenr = 0; 1807 - const bool force = inode->flags & BTRFS_INODE_NODATACOW; 1633 + struct can_nocow_file_extent_args nocow_args = { 0 }; 1808 1634 1809 1635 path = btrfs_alloc_path(); 1810 1636 if (!path) { ··· 1816 1644 return -ENOMEM; 1817 1645 } 1818 1646 1647 + nocow_args.end = end; 1648 + nocow_args.writeback_path = true; 1649 + 1819 1650 while (1) { 1820 1651 struct btrfs_key found_key; 1821 1652 struct btrfs_file_extent_item *fi; 1822 1653 struct extent_buffer *leaf; 1823 1654 u64 extent_end; 1824 - u64 extent_offset; 1825 - u64 num_bytes = 0; 1826 - u64 disk_num_bytes; 1827 1655 u64 ram_bytes; 1656 + u64 nocow_end; 1828 1657 int extent_type; 1829 1658 1830 1659 nocow = false; ··· 1901 1728 fi = 
btrfs_item_ptr(leaf, path->slots[0], 1902 1729 struct btrfs_file_extent_item); 1903 1730 extent_type = btrfs_file_extent_type(leaf, fi); 1904 - 1905 - ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 1906 - if (extent_type == BTRFS_FILE_EXTENT_REG || 1907 - extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1908 - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1909 - extent_offset = btrfs_file_extent_offset(leaf, fi); 1910 - extent_end = found_key.offset + 1911 - btrfs_file_extent_num_bytes(leaf, fi); 1912 - disk_num_bytes = 1913 - btrfs_file_extent_disk_num_bytes(leaf, fi); 1914 - /* 1915 - * If the extent we got ends before our current offset, 1916 - * skip to the next extent. 1917 - */ 1918 - if (extent_end <= cur_offset) { 1919 - path->slots[0]++; 1920 - goto next_slot; 1921 - } 1922 - /* Skip holes */ 1923 - if (disk_bytenr == 0) 1924 - goto out_check; 1925 - /* Skip compressed/encrypted/encoded extents */ 1926 - if (btrfs_file_extent_compression(leaf, fi) || 1927 - btrfs_file_extent_encryption(leaf, fi) || 1928 - btrfs_file_extent_other_encoding(leaf, fi)) 1929 - goto out_check; 1930 - /* 1931 - * If extent is created before the last volume's snapshot 1932 - * this implies the extent is shared, hence we can't do 1933 - * nocow. This is the same check as in 1934 - * btrfs_cross_ref_exist but without calling 1935 - * btrfs_search_slot. 1936 - */ 1937 - if (!freespace_inode && 1938 - btrfs_file_extent_generation(leaf, fi) <= 1939 - btrfs_root_last_snapshot(&root->root_item)) 1940 - goto out_check; 1941 - if (extent_type == BTRFS_FILE_EXTENT_REG && !force) 1942 - goto out_check; 1943 - 1944 - /* 1945 - * The following checks can be expensive, as they need to 1946 - * take other locks and do btree or rbtree searches, so 1947 - * release the path to avoid blocking other tasks for too 1948 - * long. 
1949 - */ 1950 - btrfs_release_path(path); 1951 - 1952 - ret = btrfs_cross_ref_exist(root, ino, 1953 - found_key.offset - 1954 - extent_offset, disk_bytenr, false); 1955 - if (ret) { 1956 - /* 1957 - * ret could be -EIO if the above fails to read 1958 - * metadata. 1959 - */ 1960 - if (ret < 0) { 1961 - if (cow_start != (u64)-1) 1962 - cur_offset = cow_start; 1963 - goto error; 1964 - } 1965 - 1966 - WARN_ON_ONCE(freespace_inode); 1967 - goto out_check; 1968 - } 1969 - disk_bytenr += extent_offset; 1970 - disk_bytenr += cur_offset - found_key.offset; 1971 - num_bytes = min(end + 1, extent_end) - cur_offset; 1972 - /* 1973 - * If there are pending snapshots for this root, we 1974 - * fall into common COW way 1975 - */ 1976 - if (!freespace_inode && atomic_read(&root->snapshot_force_cow)) 1977 - goto out_check; 1978 - /* 1979 - * force cow if csum exists in the range. 1980 - * this ensure that csum for a given extent are 1981 - * either valid or do not exist. 1982 - */ 1983 - ret = csum_exist_in_range(fs_info, disk_bytenr, 1984 - num_bytes); 1985 - if (ret) { 1986 - /* 1987 - * ret could be -EIO if the above fails to read 1988 - * metadata. 1989 - */ 1990 - if (ret < 0) { 1991 - if (cow_start != (u64)-1) 1992 - cur_offset = cow_start; 1993 - goto error; 1994 - } 1995 - WARN_ON_ONCE(freespace_inode); 1996 - goto out_check; 1997 - } 1998 - /* If the extent's block group is RO, we must COW */ 1999 - if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) 2000 - goto out_check; 2001 - nocow = true; 2002 - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 2003 - extent_end = found_key.offset + ram_bytes; 2004 - extent_end = ALIGN(extent_end, fs_info->sectorsize); 2005 - /* Skip extents outside of our requested range */ 2006 - if (extent_end <= start) { 2007 - path->slots[0]++; 2008 - goto next_slot; 2009 - } 2010 - } else { 2011 - /* If this triggers then we have a memory corruption */ 2012 - BUG(); 1731 + /* If this is triggered then we have a memory corruption. 
*/ 1732 + ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); 1733 + if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) { 1734 + ret = -EUCLEAN; 1735 + goto error; 2013 1736 } 1737 + ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 1738 + extent_end = btrfs_file_extent_end(path); 1739 + 1740 + /* 1741 + * If the extent we got ends before our current offset, skip to 1742 + * the next extent. 1743 + */ 1744 + if (extent_end <= cur_offset) { 1745 + path->slots[0]++; 1746 + goto next_slot; 1747 + } 1748 + 1749 + nocow_args.start = cur_offset; 1750 + ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); 1751 + if (ret < 0) { 1752 + if (cow_start != (u64)-1) 1753 + cur_offset = cow_start; 1754 + goto error; 1755 + } else if (ret == 0) { 1756 + goto out_check; 1757 + } 1758 + 1759 + ret = 0; 1760 + bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); 1761 + if (bg) 1762 + nocow = true; 2014 1763 out_check: 2015 1764 /* 2016 1765 * If nocow is false then record the beginning of the range ··· 1964 1869 cow_start = (u64)-1; 1965 1870 } 1966 1871 1872 + nocow_end = cur_offset + nocow_args.num_bytes - 1; 1873 + 1967 1874 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { 1968 - u64 orig_start = found_key.offset - extent_offset; 1875 + u64 orig_start = found_key.offset - nocow_args.extent_offset; 1969 1876 struct extent_map *em; 1970 1877 1971 - em = create_io_em(inode, cur_offset, num_bytes, 1878 + em = create_io_em(inode, cur_offset, nocow_args.num_bytes, 1972 1879 orig_start, 1973 - disk_bytenr, /* block_start */ 1974 - num_bytes, /* block_len */ 1975 - disk_num_bytes, /* orig_block_len */ 1880 + nocow_args.disk_bytenr, /* block_start */ 1881 + nocow_args.num_bytes, /* block_len */ 1882 + nocow_args.disk_num_bytes, /* orig_block_len */ 1976 1883 ram_bytes, BTRFS_COMPRESS_NONE, 1977 1884 BTRFS_ORDERED_PREALLOC); 1978 1885 if (IS_ERR(em)) { ··· 1983 1886 } 1984 1887 free_extent_map(em); 1985 1888 ret = btrfs_add_ordered_extent(inode, 1986 - 
cur_offset, num_bytes, num_bytes, 1987 - disk_bytenr, num_bytes, 0, 1889 + cur_offset, nocow_args.num_bytes, 1890 + nocow_args.num_bytes, 1891 + nocow_args.disk_bytenr, 1892 + nocow_args.num_bytes, 0, 1988 1893 1 << BTRFS_ORDERED_PREALLOC, 1989 1894 BTRFS_COMPRESS_NONE); 1990 1895 if (ret) { 1991 1896 btrfs_drop_extent_cache(inode, cur_offset, 1992 - cur_offset + num_bytes - 1, 1993 - 0); 1897 + nocow_end, 0); 1994 1898 goto error; 1995 1899 } 1996 1900 } else { 1997 1901 ret = btrfs_add_ordered_extent(inode, cur_offset, 1998 - num_bytes, num_bytes, 1999 - disk_bytenr, num_bytes, 1902 + nocow_args.num_bytes, 1903 + nocow_args.num_bytes, 1904 + nocow_args.disk_bytenr, 1905 + nocow_args.num_bytes, 2000 1906 0, 2001 1907 1 << BTRFS_ORDERED_NOCOW, 2002 1908 BTRFS_COMPRESS_NONE); ··· 2007 1907 goto error; 2008 1908 } 2009 1909 2010 - if (nocow) 2011 - btrfs_dec_nocow_writers(fs_info, disk_bytenr); 2012 - nocow = false; 1910 + if (nocow) { 1911 + btrfs_dec_nocow_writers(bg); 1912 + nocow = false; 1913 + } 2013 1914 2014 1915 if (btrfs_is_data_reloc_root(root)) 2015 1916 /* ··· 2019 1918 * from freeing metadata of created ordered extent. 
2020 1919 */ 2021 1920 ret = btrfs_reloc_clone_csums(inode, cur_offset, 2022 - num_bytes); 1921 + nocow_args.num_bytes); 2023 1922 2024 - extent_clear_unlock_delalloc(inode, cur_offset, 2025 - cur_offset + num_bytes - 1, 1923 + extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, 2026 1924 locked_page, EXTENT_LOCKED | 2027 1925 EXTENT_DELALLOC | 2028 1926 EXTENT_CLEAR_DATA_RESV, ··· 2054 1954 2055 1955 error: 2056 1956 if (nocow) 2057 - btrfs_dec_nocow_writers(fs_info, disk_bytenr); 1957 + btrfs_dec_nocow_writers(bg); 2058 1958 2059 1959 if (ret && cur_offset < end) 2060 1960 extent_clear_unlock_delalloc(inode, cur_offset, end, ··· 2598 2498 * 2599 2499 * c-3) otherwise: async submit 2600 2500 */ 2601 - blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, 2602 - int mirror_num, unsigned long bio_flags) 2603 - 2501 + void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, 2502 + int mirror_num, enum btrfs_compression_type compress_type) 2604 2503 { 2605 2504 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2606 2505 struct btrfs_root *root = BTRFS_I(inode)->root; ··· 2628 2529 if (ret) 2629 2530 goto out; 2630 2531 2631 - if (bio_flags & EXTENT_BIO_COMPRESSED) { 2532 + if (compress_type != BTRFS_COMPRESS_NONE) { 2632 2533 /* 2633 2534 * btrfs_submit_compressed_read will handle completing 2634 2535 * the bio if there were any errors, so just return 2635 2536 * here. 
2636 2537 */ 2637 - ret = btrfs_submit_compressed_read(inode, bio, 2638 - mirror_num, 2639 - bio_flags); 2640 - goto out_no_endio; 2538 + btrfs_submit_compressed_read(inode, bio, mirror_num); 2539 + return; 2641 2540 } else { 2642 2541 /* 2643 2542 * Lookup bio sums does extra checks around whether we ··· 2652 2555 if (btrfs_is_data_reloc_root(root)) 2653 2556 goto mapit; 2654 2557 /* we're doing a write, do the async checksumming */ 2655 - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, 2558 + ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 2656 2559 0, btrfs_submit_bio_start); 2657 2560 goto out; 2658 2561 } else if (!skip_sum) { ··· 2669 2572 bio->bi_status = ret; 2670 2573 bio_endio(bio); 2671 2574 } 2672 - out_no_endio: 2673 - return ret; 2674 2575 } 2675 2576 2676 2577 /* ··· 3359 3264 shash->tfm = fs_info->csum_shash; 3360 3265 3361 3266 crypto_shash_digest(shash, kaddr + pgoff, len, csum); 3267 + kunmap_atomic(kaddr); 3362 3268 3363 3269 if (memcmp(csum, csum_expected, csum_size)) 3364 3270 goto zeroit; 3365 3271 3366 - kunmap_atomic(kaddr); 3367 3272 return 0; 3368 3273 zeroit: 3369 3274 btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, ··· 3371 3276 if (bbio->device) 3372 3277 btrfs_dev_stat_inc_and_print(bbio->device, 3373 3278 BTRFS_DEV_STAT_CORRUPTION_ERRS); 3374 - memset(kaddr + pgoff, 1, len); 3375 - flush_dcache_page(page); 3376 - kunmap_atomic(kaddr); 3279 + memzero_page(page, pgoff, len); 3377 3280 return -EIO; 3378 3281 } 3379 3282 ··· 3576 3483 u64 last_objectid = 0; 3577 3484 int ret = 0, nr_unlink = 0; 3578 3485 3486 + /* Bail out if the cleanup is already running. */ 3579 3487 if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) 3580 3488 return 0; 3581 3489 ··· 3659 3565 * 3660 3566 * btrfs_find_orphan_roots() ran before us, which has 3661 3567 * found all deleted roots and loaded them into 3662 - * fs_info->fs_roots_radix. So here we can find if an 3568 + * fs_info->fs_roots. 
So here we can find if an 3663 3569 * orphan item corresponds to a deleted root by looking 3664 - * up the root from that radix tree. 3570 + * up the root from that xarray. 3665 3571 */ 3666 3572 3667 - spin_lock(&fs_info->fs_roots_radix_lock); 3668 - dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, 3669 - (unsigned long)found_key.objectid); 3573 + spin_lock(&fs_info->fs_roots_lock); 3574 + dead_root = xa_load(&fs_info->fs_roots, 3575 + (unsigned long)found_key.objectid); 3670 3576 if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) 3671 3577 is_dead_root = 1; 3672 - spin_unlock(&fs_info->fs_roots_radix_lock); 3578 + spin_unlock(&fs_info->fs_roots_lock); 3673 3579 3674 3580 if (is_dead_root) { 3675 3581 /* prevent this orphan from being found again */ ··· 3909 3815 * cache. 3910 3816 * 3911 3817 * This is required for both inode re-read from disk and delayed inode 3912 - * in delayed_nodes_tree. 3818 + * in the delayed_nodes xarray. 3913 3819 */ 3914 3820 if (BTRFS_I(inode)->last_trans == fs_info->generation) 3915 3821 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, ··· 4293 4199 * 1 for the dir index 4294 4200 * 1 for the inode ref 4295 4201 * 1 for the inode 4202 + * 1 for the parent inode 4296 4203 */ 4297 - return btrfs_start_transaction_fallback_global_rsv(root, 5); 4204 + return btrfs_start_transaction_fallback_global_rsv(root, 6); 4298 4205 } 4299 4206 4300 4207 static int btrfs_unlink(struct inode *dir, struct dentry *dentry) ··· 4788 4693 goto out; 4789 4694 } 4790 4695 } 4791 - ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize); 4696 + ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false); 4792 4697 if (ret < 0) { 4793 4698 if (!only_release_metadata) 4794 4699 btrfs_free_reserved_data_space(inode, data_reserved, ··· 5875 5780 struct list_head ins_list; 5876 5781 struct list_head del_list; 5877 5782 int ret; 5878 - struct extent_buffer *leaf; 5879 - int slot; 5880 5783 char *name_ptr; 5881 5784 int name_len; 5882 
5785 int entries = 0; ··· 5901 5808 key.offset = ctx->pos; 5902 5809 key.objectid = btrfs_ino(BTRFS_I(inode)); 5903 5810 5904 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5905 - if (ret < 0) 5906 - goto err; 5907 - 5908 - while (1) { 5811 + btrfs_for_each_slot(root, &key, &found_key, path, ret) { 5909 5812 struct dir_entry *entry; 5910 - 5911 - leaf = path->nodes[0]; 5912 - slot = path->slots[0]; 5913 - if (slot >= btrfs_header_nritems(leaf)) { 5914 - ret = btrfs_next_leaf(root, path); 5915 - if (ret < 0) 5916 - goto err; 5917 - else if (ret > 0) 5918 - break; 5919 - continue; 5920 - } 5921 - 5922 - btrfs_item_key_to_cpu(leaf, &found_key, slot); 5813 + struct extent_buffer *leaf = path->nodes[0]; 5923 5814 5924 5815 if (found_key.objectid != key.objectid) 5925 5816 break; 5926 5817 if (found_key.type != BTRFS_DIR_INDEX_KEY) 5927 5818 break; 5928 5819 if (found_key.offset < ctx->pos) 5929 - goto next; 5820 + continue; 5930 5821 if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) 5931 - goto next; 5932 - di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 5822 + continue; 5823 + di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); 5933 5824 name_len = btrfs_dir_name_len(leaf, di); 5934 5825 if ((total_len + sizeof(struct dir_entry) + name_len) >= 5935 5826 PAGE_SIZE) { ··· 5940 5863 entries++; 5941 5864 addr += sizeof(struct dir_entry) + name_len; 5942 5865 total_len += sizeof(struct dir_entry) + name_len; 5943 - next: 5944 - path->slots[0]++; 5945 5866 } 5867 + /* Catch error encountered during iteration */ 5868 + if (ret < 0) 5869 + goto err; 5870 + 5946 5871 btrfs_release_path(path); 5947 5872 5948 5873 ret = btrfs_filldir(private->filldir_buf, entries, ctx); ··· 6132 6053 btrfs_find_actor, &args); 6133 6054 } 6134 6055 6056 + int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, 6057 + unsigned int *trans_num_items) 6058 + { 6059 + struct inode *dir = args->dir; 6060 + struct inode *inode = args->inode; 6061 + int 
ret; 6062 + 6063 + ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); 6064 + if (ret) 6065 + return ret; 6066 + 6067 + /* 1 to add inode item */ 6068 + *trans_num_items = 1; 6069 + /* 1 to add compression property */ 6070 + if (BTRFS_I(dir)->prop_compress) 6071 + (*trans_num_items)++; 6072 + /* 1 to add default ACL xattr */ 6073 + if (args->default_acl) 6074 + (*trans_num_items)++; 6075 + /* 1 to add access ACL xattr */ 6076 + if (args->acl) 6077 + (*trans_num_items)++; 6078 + #ifdef CONFIG_SECURITY 6079 + /* 1 to add LSM xattr */ 6080 + if (dir->i_security) 6081 + (*trans_num_items)++; 6082 + #endif 6083 + if (args->orphan) { 6084 + /* 1 to add orphan item */ 6085 + (*trans_num_items)++; 6086 + } else { 6087 + /* 6088 + * 1 to add dir item 6089 + * 1 to add dir index 6090 + * 1 to update parent inode item 6091 + * 6092 + * No need for 1 unit for the inode ref item because it is 6093 + * inserted in a batch together with the inode item at 6094 + * btrfs_create_new_inode(). 6095 + */ 6096 + *trans_num_items += 3; 6097 + } 6098 + return 0; 6099 + } 6100 + 6101 + void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) 6102 + { 6103 + posix_acl_release(args->acl); 6104 + posix_acl_release(args->default_acl); 6105 + } 6106 + 6135 6107 /* 6136 6108 * Inherit flags from the parent inode. 
6137 6109 * ··· 6191 6061 static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 6192 6062 { 6193 6063 unsigned int flags; 6194 - 6195 - if (!dir) 6196 - return; 6197 6064 6198 6065 flags = BTRFS_I(dir)->flags; 6199 6066 ··· 6211 6084 btrfs_sync_inode_flags_to_i_flags(inode); 6212 6085 } 6213 6086 6214 - static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 6215 - struct btrfs_root *root, 6216 - struct user_namespace *mnt_userns, 6217 - struct inode *dir, 6218 - const char *name, int name_len, 6219 - u64 ref_objectid, u64 objectid, 6220 - umode_t mode, u64 *index) 6087 + int btrfs_create_new_inode(struct btrfs_trans_handle *trans, 6088 + struct btrfs_new_inode_args *args) 6221 6089 { 6222 - struct btrfs_fs_info *fs_info = root->fs_info; 6223 - struct inode *inode; 6090 + struct inode *dir = args->dir; 6091 + struct inode *inode = args->inode; 6092 + const char *name = args->orphan ? NULL : args->dentry->d_name.name; 6093 + int name_len = args->orphan ? 
0 : args->dentry->d_name.len; 6094 + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6095 + struct btrfs_root *root; 6224 6096 struct btrfs_inode_item *inode_item; 6225 6097 struct btrfs_key *location; 6226 6098 struct btrfs_path *path; 6099 + u64 objectid; 6227 6100 struct btrfs_inode_ref *ref; 6228 6101 struct btrfs_key key[2]; 6229 6102 u32 sizes[2]; 6230 6103 struct btrfs_item_batch batch; 6231 6104 unsigned long ptr; 6232 - unsigned int nofs_flag; 6233 6105 int ret; 6234 6106 6235 6107 path = btrfs_alloc_path(); 6236 6108 if (!path) 6237 - return ERR_PTR(-ENOMEM); 6109 + return -ENOMEM; 6238 6110 6239 - nofs_flag = memalloc_nofs_save(); 6240 - inode = new_inode(fs_info->sb); 6241 - memalloc_nofs_restore(nofs_flag); 6242 - if (!inode) { 6243 - btrfs_free_path(path); 6244 - return ERR_PTR(-ENOMEM); 6245 - } 6111 + if (!args->subvol) 6112 + BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); 6113 + root = BTRFS_I(inode)->root; 6246 6114 6247 - /* 6248 - * O_TMPFILE, set link count to 0, so that after this point, 6249 - * we fill in an inode item with the correct link count. 6250 - */ 6251 - if (!name) 6252 - set_nlink(inode, 0); 6253 - 6254 - /* 6255 - * we have to initialize this early, so we can reclaim the inode 6256 - * number if we fail afterwards in this function. 6257 - */ 6115 + ret = btrfs_get_free_objectid(root, &objectid); 6116 + if (ret) 6117 + goto out; 6258 6118 inode->i_ino = objectid; 6259 6119 6260 - if (dir && name) { 6120 + if (args->orphan) { 6121 + /* 6122 + * O_TMPFILE, set link count to 0, so that after this point, we 6123 + * fill in an inode item with the correct link count. 
6124 + */ 6125 + set_nlink(inode, 0); 6126 + } else { 6261 6127 trace_btrfs_inode_request(dir); 6262 6128 6263 - ret = btrfs_set_inode_index(BTRFS_I(dir), index); 6264 - if (ret) { 6265 - btrfs_free_path(path); 6266 - iput(inode); 6267 - return ERR_PTR(ret); 6268 - } 6269 - } else if (dir) { 6270 - *index = 0; 6129 + ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index); 6130 + if (ret) 6131 + goto out; 6271 6132 } 6272 - /* 6273 - * index_cnt is ignored for everything but a dir, 6274 - * btrfs_set_inode_index_count has an explanation for the magic 6275 - * number 6276 - */ 6277 - BTRFS_I(inode)->index_cnt = 2; 6278 - BTRFS_I(inode)->dir_index = *index; 6279 - BTRFS_I(inode)->root = btrfs_grab_root(root); 6133 + /* index_cnt is ignored for everything but a dir. */ 6134 + BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; 6280 6135 BTRFS_I(inode)->generation = trans->transid; 6281 6136 inode->i_generation = BTRFS_I(inode)->generation; 6137 + 6138 + /* 6139 + * Subvolumes don't inherit flags from their parent directory. 6140 + * Originally this was probably by accident, but we probably can't 6141 + * change it now without compatibility issues. 
6142 + */ 6143 + if (!args->subvol) 6144 + btrfs_inherit_iflags(inode, dir); 6145 + 6146 + if (S_ISREG(inode->i_mode)) { 6147 + if (btrfs_test_opt(fs_info, NODATASUM)) 6148 + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6149 + if (btrfs_test_opt(fs_info, NODATACOW)) 6150 + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | 6151 + BTRFS_INODE_NODATASUM; 6152 + } 6153 + 6154 + location = &BTRFS_I(inode)->location; 6155 + location->objectid = objectid; 6156 + location->offset = 0; 6157 + location->type = BTRFS_INODE_ITEM_KEY; 6158 + 6159 + ret = btrfs_insert_inode_locked(inode); 6160 + if (ret < 0) { 6161 + if (!args->orphan) 6162 + BTRFS_I(dir)->index_cnt--; 6163 + goto out; 6164 + } 6282 6165 6283 6166 /* 6284 6167 * We could have gotten an inode number from somebody who was fsynced ··· 6304 6167 6305 6168 sizes[0] = sizeof(struct btrfs_inode_item); 6306 6169 6307 - if (name) { 6170 + if (!args->orphan) { 6308 6171 /* 6309 6172 * Start new inodes with an inode_ref. This is slightly more 6310 6173 * efficient for small numbers of hard links since they will ··· 6313 6176 */ 6314 6177 key[1].objectid = objectid; 6315 6178 key[1].type = BTRFS_INODE_REF_KEY; 6316 - key[1].offset = ref_objectid; 6317 - 6318 - sizes[1] = name_len + sizeof(*ref); 6319 - } 6320 - 6321 - location = &BTRFS_I(inode)->location; 6322 - location->objectid = objectid; 6323 - location->offset = 0; 6324 - location->type = BTRFS_INODE_ITEM_KEY; 6325 - 6326 - ret = btrfs_insert_inode_locked(inode); 6327 - if (ret < 0) { 6328 - iput(inode); 6329 - goto fail; 6179 + if (args->subvol) { 6180 + key[1].offset = objectid; 6181 + sizes[1] = 2 + sizeof(*ref); 6182 + } else { 6183 + key[1].offset = btrfs_ino(BTRFS_I(dir)); 6184 + sizes[1] = name_len + sizeof(*ref); 6185 + } 6330 6186 } 6331 6187 6332 6188 batch.keys = &key[0]; 6333 6189 batch.data_sizes = &sizes[0]; 6334 - batch.total_data_size = sizes[0] + (name ? sizes[1] : 0); 6335 - batch.nr = name ? 
2 : 1; 6190 + batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); 6191 + batch.nr = args->orphan ? 1 : 2; 6336 6192 ret = btrfs_insert_empty_items(trans, root, path, &batch); 6337 - if (ret != 0) 6338 - goto fail_unlock; 6339 - 6340 - inode_init_owner(mnt_userns, inode, dir, mode); 6341 - inode_set_bytes(inode, 0); 6193 + if (ret != 0) { 6194 + btrfs_abort_transaction(trans, ret); 6195 + goto discard; 6196 + } 6342 6197 6343 6198 inode->i_mtime = current_time(inode); 6344 6199 inode->i_atime = inode->i_mtime; 6345 6200 inode->i_ctime = inode->i_mtime; 6346 6201 BTRFS_I(inode)->i_otime = inode->i_mtime; 6202 + 6203 + /* 6204 + * We're going to fill the inode item now, so at this point the inode 6205 + * must be fully initialized. 6206 + */ 6347 6207 6348 6208 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 6349 6209 struct btrfs_inode_item); ··· 6348 6214 sizeof(*inode_item)); 6349 6215 fill_inode_item(trans, path->nodes[0], inode_item, inode); 6350 6216 6351 - if (name) { 6217 + if (!args->orphan) { 6352 6218 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, 6353 6219 struct btrfs_inode_ref); 6354 - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 6355 - btrfs_set_inode_ref_index(path->nodes[0], ref, *index); 6356 6220 ptr = (unsigned long)(ref + 1); 6357 - write_extent_buffer(path->nodes[0], name, ptr, name_len); 6221 + if (args->subvol) { 6222 + btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2); 6223 + btrfs_set_inode_ref_index(path->nodes[0], ref, 0); 6224 + write_extent_buffer(path->nodes[0], "..", ptr, 2); 6225 + } else { 6226 + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); 6227 + btrfs_set_inode_ref_index(path->nodes[0], ref, 6228 + BTRFS_I(inode)->dir_index); 6229 + write_extent_buffer(path->nodes[0], name, ptr, name_len); 6230 + } 6358 6231 } 6359 6232 6360 6233 btrfs_mark_buffer_dirty(path->nodes[0]); 6361 - btrfs_free_path(path); 6234 + btrfs_release_path(path); 6362 6235 6363 - 
btrfs_inherit_iflags(inode, dir); 6236 + if (args->subvol) { 6237 + struct inode *parent; 6364 6238 6365 - if (S_ISREG(mode)) { 6366 - if (btrfs_test_opt(fs_info, NODATASUM)) 6367 - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 6368 - if (btrfs_test_opt(fs_info, NODATACOW)) 6369 - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | 6370 - BTRFS_INODE_NODATASUM; 6239 + /* 6240 + * Subvolumes inherit properties from their parent subvolume, 6241 + * not the directory they were created in. 6242 + */ 6243 + parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, 6244 + BTRFS_I(dir)->root); 6245 + if (IS_ERR(parent)) { 6246 + ret = PTR_ERR(parent); 6247 + } else { 6248 + ret = btrfs_inode_inherit_props(trans, inode, parent); 6249 + iput(parent); 6250 + } 6251 + } else { 6252 + ret = btrfs_inode_inherit_props(trans, inode, dir); 6253 + } 6254 + if (ret) { 6255 + btrfs_err(fs_info, 6256 + "error inheriting props for ino %llu (root %llu): %d", 6257 + btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, 6258 + ret); 6259 + } 6260 + 6261 + /* 6262 + * Subvolumes don't inherit ACLs or get passed to the LSM. This is 6263 + * probably a bug. 
6264 + */ 6265 + if (!args->subvol) { 6266 + ret = btrfs_init_inode_security(trans, args); 6267 + if (ret) { 6268 + btrfs_abort_transaction(trans, ret); 6269 + goto discard; 6270 + } 6371 6271 } 6372 6272 6373 6273 inode_tree_add(inode); ··· 6411 6243 6412 6244 btrfs_update_root_times(trans, root); 6413 6245 6414 - ret = btrfs_inode_inherit_props(trans, inode, dir); 6415 - if (ret) 6416 - btrfs_err(fs_info, 6417 - "error inheriting props for ino %llu (root %llu): %d", 6418 - btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); 6246 + if (args->orphan) { 6247 + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 6248 + } else { 6249 + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 6250 + name_len, 0, BTRFS_I(inode)->dir_index); 6251 + } 6252 + if (ret) { 6253 + btrfs_abort_transaction(trans, ret); 6254 + goto discard; 6255 + } 6419 6256 6420 - return inode; 6257 + ret = 0; 6258 + goto out; 6421 6259 6422 - fail_unlock: 6260 + discard: 6261 + /* 6262 + * discard_new_inode() calls iput(), but the caller owns the reference 6263 + * to the inode. 
6264 + */ 6265 + ihold(inode); 6423 6266 discard_new_inode(inode); 6424 - fail: 6425 - if (dir && name) 6426 - BTRFS_I(dir)->index_cnt--; 6267 + out: 6427 6268 btrfs_free_path(path); 6428 - return ERR_PTR(ret); 6269 + return ret; 6429 6270 } 6430 6271 6431 6272 /* ··· 6526 6349 return ret; 6527 6350 } 6528 6351 6529 - static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 6530 - struct btrfs_inode *dir, struct dentry *dentry, 6531 - struct btrfs_inode *inode, int backref, u64 index) 6352 + static int btrfs_create_common(struct inode *dir, struct dentry *dentry, 6353 + struct inode *inode) 6532 6354 { 6533 - int err = btrfs_add_link(trans, dir, inode, 6534 - dentry->d_name.name, dentry->d_name.len, 6535 - backref, index); 6536 - if (err > 0) 6537 - err = -EEXIST; 6355 + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6356 + struct btrfs_root *root = BTRFS_I(dir)->root; 6357 + struct btrfs_new_inode_args new_inode_args = { 6358 + .dir = dir, 6359 + .dentry = dentry, 6360 + .inode = inode, 6361 + }; 6362 + unsigned int trans_num_items; 6363 + struct btrfs_trans_handle *trans; 6364 + int err; 6365 + 6366 + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); 6367 + if (err) 6368 + goto out_inode; 6369 + 6370 + trans = btrfs_start_transaction(root, trans_num_items); 6371 + if (IS_ERR(trans)) { 6372 + err = PTR_ERR(trans); 6373 + goto out_new_inode_args; 6374 + } 6375 + 6376 + err = btrfs_create_new_inode(trans, &new_inode_args); 6377 + if (!err) 6378 + d_instantiate_new(dentry, inode); 6379 + 6380 + btrfs_end_transaction(trans); 6381 + btrfs_btree_balance_dirty(fs_info); 6382 + out_new_inode_args: 6383 + btrfs_new_inode_args_destroy(&new_inode_args); 6384 + out_inode: 6385 + if (err) 6386 + iput(inode); 6538 6387 return err; 6539 6388 } 6540 6389 6541 6390 static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, 6542 6391 struct dentry *dentry, umode_t mode, dev_t rdev) 6543 6392 { 6544 - struct btrfs_fs_info *fs_info = 
btrfs_sb(dir->i_sb); 6545 - struct btrfs_trans_handle *trans; 6546 - struct btrfs_root *root = BTRFS_I(dir)->root; 6547 - struct inode *inode = NULL; 6548 - int err; 6549 - u64 objectid; 6550 - u64 index = 0; 6393 + struct inode *inode; 6551 6394 6552 - /* 6553 - * 2 for inode item and ref 6554 - * 2 for dir items 6555 - * 1 for xattr if selinux is on 6556 - */ 6557 - trans = btrfs_start_transaction(root, 5); 6558 - if (IS_ERR(trans)) 6559 - return PTR_ERR(trans); 6560 - 6561 - err = btrfs_get_free_objectid(root, &objectid); 6562 - if (err) 6563 - goto out_unlock; 6564 - 6565 - inode = btrfs_new_inode(trans, root, mnt_userns, dir, 6566 - dentry->d_name.name, dentry->d_name.len, 6567 - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); 6568 - if (IS_ERR(inode)) { 6569 - err = PTR_ERR(inode); 6570 - inode = NULL; 6571 - goto out_unlock; 6572 - } 6573 - 6574 - /* 6575 - * If the active LSM wants to access the inode during 6576 - * d_instantiate it needs these. Smack checks to see 6577 - * if the filesystem supports xattrs by looking at the 6578 - * ops vector. 
6579 - */ 6395 + inode = new_inode(dir->i_sb); 6396 + if (!inode) 6397 + return -ENOMEM; 6398 + inode_init_owner(mnt_userns, inode, dir, mode); 6580 6399 inode->i_op = &btrfs_special_inode_operations; 6581 6400 init_special_inode(inode, inode->i_mode, rdev); 6582 - 6583 - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6584 - if (err) 6585 - goto out_unlock; 6586 - 6587 - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6588 - 0, index); 6589 - if (err) 6590 - goto out_unlock; 6591 - 6592 - btrfs_update_inode(trans, root, BTRFS_I(inode)); 6593 - d_instantiate_new(dentry, inode); 6594 - 6595 - out_unlock: 6596 - btrfs_end_transaction(trans); 6597 - btrfs_btree_balance_dirty(fs_info); 6598 - if (err && inode) { 6599 - inode_dec_link_count(inode); 6600 - discard_new_inode(inode); 6601 - } 6602 - return err; 6401 + return btrfs_create_common(dir, dentry, inode); 6603 6402 } 6604 6403 6605 6404 static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, 6606 6405 struct dentry *dentry, umode_t mode, bool excl) 6607 6406 { 6608 - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6609 - struct btrfs_trans_handle *trans; 6610 - struct btrfs_root *root = BTRFS_I(dir)->root; 6611 - struct inode *inode = NULL; 6612 - int err; 6613 - u64 objectid; 6614 - u64 index = 0; 6407 + struct inode *inode; 6615 6408 6616 - /* 6617 - * 2 for inode item and ref 6618 - * 2 for dir items 6619 - * 1 for xattr if selinux is on 6620 - */ 6621 - trans = btrfs_start_transaction(root, 5); 6622 - if (IS_ERR(trans)) 6623 - return PTR_ERR(trans); 6624 - 6625 - err = btrfs_get_free_objectid(root, &objectid); 6626 - if (err) 6627 - goto out_unlock; 6628 - 6629 - inode = btrfs_new_inode(trans, root, mnt_userns, dir, 6630 - dentry->d_name.name, dentry->d_name.len, 6631 - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); 6632 - if (IS_ERR(inode)) { 6633 - err = PTR_ERR(inode); 6634 - inode = NULL; 6635 - goto out_unlock; 6636 - } 6637 - /* 6638 
- * If the active LSM wants to access the inode during 6639 - * d_instantiate it needs these. Smack checks to see 6640 - * if the filesystem supports xattrs by looking at the 6641 - * ops vector. 6642 - */ 6409 + inode = new_inode(dir->i_sb); 6410 + if (!inode) 6411 + return -ENOMEM; 6412 + inode_init_owner(mnt_userns, inode, dir, mode); 6643 6413 inode->i_fop = &btrfs_file_operations; 6644 6414 inode->i_op = &btrfs_file_inode_operations; 6645 6415 inode->i_mapping->a_ops = &btrfs_aops; 6646 - 6647 - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6648 - if (err) 6649 - goto out_unlock; 6650 - 6651 - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6652 - if (err) 6653 - goto out_unlock; 6654 - 6655 - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6656 - 0, index); 6657 - if (err) 6658 - goto out_unlock; 6659 - 6660 - d_instantiate_new(dentry, inode); 6661 - 6662 - out_unlock: 6663 - btrfs_end_transaction(trans); 6664 - if (err && inode) { 6665 - inode_dec_link_count(inode); 6666 - discard_new_inode(inode); 6667 - } 6668 - btrfs_btree_balance_dirty(fs_info); 6669 - return err; 6416 + return btrfs_create_common(dir, dentry, inode); 6670 6417 } 6671 6418 6672 6419 static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ··· 6636 6535 ihold(inode); 6637 6536 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); 6638 6537 6639 - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 6640 - 1, index); 6538 + err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 6539 + dentry->d_name.name, dentry->d_name.len, 1, index); 6641 6540 6642 6541 if (err) { 6643 6542 drop_inode = 1; ··· 6674 6573 static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, 6675 6574 struct dentry *dentry, umode_t mode) 6676 6575 { 6677 - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 6678 - struct inode *inode = NULL; 6679 - struct btrfs_trans_handle *trans; 6680 - struct btrfs_root 
*root = BTRFS_I(dir)->root; 6681 - int err = 0; 6682 - u64 objectid = 0; 6683 - u64 index = 0; 6576 + struct inode *inode; 6684 6577 6685 - /* 6686 - * 2 items for inode and ref 6687 - * 2 items for dir items 6688 - * 1 for xattr if selinux is on 6689 - */ 6690 - trans = btrfs_start_transaction(root, 5); 6691 - if (IS_ERR(trans)) 6692 - return PTR_ERR(trans); 6693 - 6694 - err = btrfs_get_free_objectid(root, &objectid); 6695 - if (err) 6696 - goto out_fail; 6697 - 6698 - inode = btrfs_new_inode(trans, root, mnt_userns, dir, 6699 - dentry->d_name.name, dentry->d_name.len, 6700 - btrfs_ino(BTRFS_I(dir)), objectid, 6701 - S_IFDIR | mode, &index); 6702 - if (IS_ERR(inode)) { 6703 - err = PTR_ERR(inode); 6704 - inode = NULL; 6705 - goto out_fail; 6706 - } 6707 - 6708 - /* these must be set before we unlock the inode */ 6578 + inode = new_inode(dir->i_sb); 6579 + if (!inode) 6580 + return -ENOMEM; 6581 + inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode); 6709 6582 inode->i_op = &btrfs_dir_inode_operations; 6710 6583 inode->i_fop = &btrfs_dir_file_operations; 6711 - 6712 - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6713 - if (err) 6714 - goto out_fail; 6715 - 6716 - btrfs_i_size_write(BTRFS_I(inode), 0); 6717 - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 6718 - if (err) 6719 - goto out_fail; 6720 - 6721 - err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 6722 - dentry->d_name.name, 6723 - dentry->d_name.len, 0, index); 6724 - if (err) 6725 - goto out_fail; 6726 - 6727 - d_instantiate_new(dentry, inode); 6728 - 6729 - out_fail: 6730 - btrfs_end_transaction(trans); 6731 - if (err && inode) { 6732 - inode_dec_link_count(inode); 6733 - discard_new_inode(inode); 6734 - } 6735 - btrfs_btree_balance_dirty(fs_info); 6736 - return err; 6584 + return btrfs_create_common(dir, dentry, inode); 6737 6585 } 6738 6586 6739 6587 static noinline int uncompress_inline(struct btrfs_path *path, ··· 7191 7141 u64 *ram_bytes, bool strict) 7192 
7142 { 7193 7143 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7144 + struct can_nocow_file_extent_args nocow_args = { 0 }; 7194 7145 struct btrfs_path *path; 7195 7146 int ret; 7196 7147 struct extent_buffer *leaf; ··· 7199 7148 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7200 7149 struct btrfs_file_extent_item *fi; 7201 7150 struct btrfs_key key; 7202 - u64 disk_bytenr; 7203 - u64 backref_offset; 7204 - u64 extent_end; 7205 - u64 num_bytes; 7206 - int slot; 7207 7151 int found_type; 7208 - bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); 7209 7152 7210 7153 path = btrfs_alloc_path(); 7211 7154 if (!path) ··· 7210 7165 if (ret < 0) 7211 7166 goto out; 7212 7167 7213 - slot = path->slots[0]; 7214 7168 if (ret == 1) { 7215 - if (slot == 0) { 7169 + if (path->slots[0] == 0) { 7216 7170 /* can't find the item, must cow */ 7217 7171 ret = 0; 7218 7172 goto out; 7219 7173 } 7220 - slot--; 7174 + path->slots[0]--; 7221 7175 } 7222 7176 ret = 0; 7223 7177 leaf = path->nodes[0]; 7224 - btrfs_item_key_to_cpu(leaf, &key, slot); 7178 + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 7225 7179 if (key.objectid != btrfs_ino(BTRFS_I(inode)) || 7226 7180 key.type != BTRFS_EXTENT_DATA_KEY) { 7227 7181 /* not our file or wrong item type, must cow */ ··· 7232 7188 goto out; 7233 7189 } 7234 7190 7235 - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 7191 + if (btrfs_file_extent_end(path) <= offset) 7192 + goto out; 7193 + 7194 + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 7236 7195 found_type = btrfs_file_extent_type(leaf, fi); 7237 - if (found_type != BTRFS_FILE_EXTENT_REG && 7238 - found_type != BTRFS_FILE_EXTENT_PREALLOC) { 7239 - /* not a regular extent, must cow */ 7240 - goto out; 7241 - } 7242 - 7243 - if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) 7244 - goto out; 7245 - 7246 - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 7247 - if (extent_end <= offset) 7248 - goto 
out; 7249 - 7250 - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 7251 - if (disk_bytenr == 0) 7252 - goto out; 7253 - 7254 - if (btrfs_file_extent_compression(leaf, fi) || 7255 - btrfs_file_extent_encryption(leaf, fi) || 7256 - btrfs_file_extent_other_encoding(leaf, fi)) 7257 - goto out; 7258 - 7259 - /* 7260 - * Do the same check as in btrfs_cross_ref_exist but without the 7261 - * unnecessary search. 7262 - */ 7263 - if (!strict && 7264 - (btrfs_file_extent_generation(leaf, fi) <= 7265 - btrfs_root_last_snapshot(&root->root_item))) 7266 - goto out; 7267 - 7268 - backref_offset = btrfs_file_extent_offset(leaf, fi); 7269 - 7270 - if (orig_start) { 7271 - *orig_start = key.offset - backref_offset; 7272 - *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); 7196 + if (ram_bytes) 7273 7197 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 7198 + 7199 + nocow_args.start = offset; 7200 + nocow_args.end = offset + *len - 1; 7201 + nocow_args.strict = strict; 7202 + nocow_args.free_path = true; 7203 + 7204 + ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); 7205 + /* can_nocow_file_extent() has freed the path. */ 7206 + path = NULL; 7207 + 7208 + if (ret != 1) { 7209 + /* Treat errors as not being able to NOCOW. 
*/ 7210 + ret = 0; 7211 + goto out; 7274 7212 } 7275 7213 7276 - if (btrfs_extent_readonly(fs_info, disk_bytenr)) 7214 + ret = 0; 7215 + if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) 7277 7216 goto out; 7278 7217 7279 - num_bytes = min(offset + *len, extent_end) - offset; 7280 - if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { 7218 + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && 7219 + found_type == BTRFS_FILE_EXTENT_PREALLOC) { 7281 7220 u64 range_end; 7282 7221 7283 - range_end = round_up(offset + num_bytes, 7222 + range_end = round_up(offset + nocow_args.num_bytes, 7284 7223 root->fs_info->sectorsize) - 1; 7285 7224 ret = test_range_bit(io_tree, offset, range_end, 7286 7225 EXTENT_DELALLOC, 0, NULL); ··· 7273 7246 } 7274 7247 } 7275 7248 7276 - btrfs_release_path(path); 7249 + if (orig_start) 7250 + *orig_start = key.offset - nocow_args.extent_offset; 7251 + if (orig_block_len) 7252 + *orig_block_len = nocow_args.disk_num_bytes; 7277 7253 7278 - /* 7279 - * look for other files referencing this extent, if we 7280 - * find any we must cow 7281 - */ 7282 - 7283 - ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), 7284 - key.offset - backref_offset, disk_bytenr, 7285 - strict); 7286 - if (ret) { 7287 - ret = 0; 7288 - goto out; 7289 - } 7290 - 7291 - /* 7292 - * adjust disk_bytenr and num_bytes to cover just the bytes 7293 - * in this extent we are about to write. 
If there 7294 - * are any csums in that range we have to cow in order 7295 - * to keep the csums correct 7296 - */ 7297 - disk_bytenr += backref_offset; 7298 - disk_bytenr += offset - key.offset; 7299 - if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) 7300 - goto out; 7301 - /* 7302 - * all of the above have passed, it is safe to overwrite this extent 7303 - * without cow 7304 - */ 7305 - *len = num_bytes; 7254 + *len = nocow_args.num_bytes; 7306 7255 ret = 1; 7307 7256 out: 7308 7257 btrfs_free_path(path); ··· 7286 7283 } 7287 7284 7288 7285 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, 7289 - struct extent_state **cached_state, bool writing) 7286 + struct extent_state **cached_state, 7287 + unsigned int iomap_flags) 7290 7288 { 7289 + const bool writing = (iomap_flags & IOMAP_WRITE); 7290 + const bool nowait = (iomap_flags & IOMAP_NOWAIT); 7291 + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 7291 7292 struct btrfs_ordered_extent *ordered; 7292 7293 int ret = 0; 7293 7294 7294 7295 while (1) { 7295 - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7296 - cached_state); 7296 + if (nowait) { 7297 + if (!try_lock_extent(io_tree, lockstart, lockend)) 7298 + return -EAGAIN; 7299 + } else { 7300 + lock_extent_bits(io_tree, lockstart, lockend, cached_state); 7301 + } 7297 7302 /* 7298 7303 * We're concerned with the entire range that we're going to be 7299 7304 * doing DIO to, so we need to make sure there's no ordered ··· 7322 7311 lockstart, lockend))) 7323 7312 break; 7324 7313 7325 - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7326 - cached_state); 7314 + unlock_extent_cached(io_tree, lockstart, lockend, cached_state); 7327 7315 7328 7316 if (ordered) { 7317 + if (nowait) { 7318 + btrfs_put_ordered_extent(ordered); 7319 + ret = -EAGAIN; 7320 + break; 7321 + } 7329 7322 /* 7330 7323 * If we are doing a DIO read and the ordered extent we 7331 7324 * found is for a buffered 
write, we can not wait for it ··· 7349 7334 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) 7350 7335 btrfs_start_ordered_extent(ordered, 1); 7351 7336 else 7352 - ret = -ENOTBLK; 7337 + ret = nowait ? -EAGAIN : -ENOTBLK; 7353 7338 btrfs_put_ordered_extent(ordered); 7354 7339 } else { 7355 7340 /* ··· 7365 7350 * ordered extent to complete while holding a lock on 7366 7351 * that page. 7367 7352 */ 7368 - ret = -ENOTBLK; 7353 + ret = nowait ? -EAGAIN : -ENOTBLK; 7369 7354 } 7370 7355 7371 7356 if (ret) ··· 7439 7424 static int btrfs_get_blocks_direct_write(struct extent_map **map, 7440 7425 struct inode *inode, 7441 7426 struct btrfs_dio_data *dio_data, 7442 - u64 start, u64 len) 7427 + u64 start, u64 len, 7428 + unsigned int iomap_flags) 7443 7429 { 7430 + const bool nowait = (iomap_flags & IOMAP_NOWAIT); 7444 7431 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7445 7432 struct extent_map *em = *map; 7446 7433 int type; 7447 7434 u64 block_start, orig_start, orig_block_len, ram_bytes; 7435 + struct btrfs_block_group *bg; 7448 7436 bool can_nocow = false; 7449 7437 bool space_reserved = false; 7450 7438 u64 prev_len; ··· 7473 7455 block_start = em->block_start + (start - em->start); 7474 7456 7475 7457 if (can_nocow_extent(inode, start, &len, &orig_start, 7476 - &orig_block_len, &ram_bytes, false) == 1 && 7477 - btrfs_inc_nocow_writers(fs_info, block_start)) 7478 - can_nocow = true; 7458 + &orig_block_len, &ram_bytes, false) == 1) { 7459 + bg = btrfs_inc_nocow_writers(fs_info, block_start); 7460 + if (bg) 7461 + can_nocow = true; 7462 + } 7479 7463 } 7480 7464 7481 7465 prev_len = len; ··· 7485 7465 struct extent_map *em2; 7486 7466 7487 7467 /* We can NOCOW, so only need to reserve metadata space. */ 7488 - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len); 7468 + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, 7469 + nowait); 7489 7470 if (ret < 0) { 7490 7471 /* Our caller expects us to free the input extent map. 
*/ 7491 7472 free_extent_map(em); 7492 7473 *map = NULL; 7493 - btrfs_dec_nocow_writers(fs_info, block_start); 7474 + btrfs_dec_nocow_writers(bg); 7475 + if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) 7476 + ret = -EAGAIN; 7494 7477 goto out; 7495 7478 } 7496 7479 space_reserved = true; ··· 7502 7479 orig_start, block_start, 7503 7480 len, orig_block_len, 7504 7481 ram_bytes, type); 7505 - btrfs_dec_nocow_writers(fs_info, block_start); 7482 + btrfs_dec_nocow_writers(bg); 7506 7483 if (type == BTRFS_ORDERED_PREALLOC) { 7507 7484 free_extent_map(em); 7508 7485 *map = em = em2; ··· 7512 7489 ret = PTR_ERR(em2); 7513 7490 goto out; 7514 7491 } 7492 + 7493 + dio_data->nocow_done = true; 7515 7494 } else { 7516 7495 /* Our caller expects us to free the input extent map. */ 7517 7496 free_extent_map(em); 7518 7497 *map = NULL; 7519 7498 7520 - /* We have to COW, so need to reserve metadata and data space. */ 7521 - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), 7522 - &dio_data->data_reserved, 7523 - start, len); 7499 + if (nowait) 7500 + return -EAGAIN; 7501 + 7502 + /* 7503 + * If we could not allocate data space before locking the file 7504 + * range and we can't do a NOCOW write, then we have to fail. 7505 + */ 7506 + if (!dio_data->data_space_reserved) 7507 + return -ENOSPC; 7508 + 7509 + /* 7510 + * We have to COW and we have already reserved data space before, 7511 + * so now we reserve only metadata. 
7512 + */ 7513 + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, 7514 + false); 7524 7515 if (ret < 0) 7525 7516 goto out; 7526 7517 space_reserved = true; ··· 7547 7510 *map = em; 7548 7511 len = min(len, em->len - (start - em->start)); 7549 7512 if (len < prev_len) 7550 - btrfs_delalloc_release_space(BTRFS_I(inode), 7551 - dio_data->data_reserved, 7552 - start + len, prev_len - len, 7553 - true); 7513 + btrfs_delalloc_release_metadata(BTRFS_I(inode), 7514 + prev_len - len, true); 7554 7515 } 7555 7516 7556 7517 /* ··· 7566 7531 out: 7567 7532 if (ret && space_reserved) { 7568 7533 btrfs_delalloc_release_extents(BTRFS_I(inode), len); 7569 - if (can_nocow) { 7570 - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); 7571 - } else { 7572 - btrfs_delalloc_release_space(BTRFS_I(inode), 7573 - dio_data->data_reserved, 7574 - start, len, true); 7575 - extent_changeset_free(dio_data->data_reserved); 7576 - dio_data->data_reserved = NULL; 7577 - } 7534 + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); 7578 7535 } 7579 7536 return ret; 7580 7537 } ··· 7575 7548 loff_t length, unsigned int flags, struct iomap *iomap, 7576 7549 struct iomap *srcmap) 7577 7550 { 7551 + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); 7578 7552 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7579 7553 struct extent_map *em; 7580 7554 struct extent_state *cached_state = NULL; 7581 - struct btrfs_dio_data *dio_data = NULL; 7555 + struct btrfs_dio_data *dio_data = iter->private; 7582 7556 u64 lockstart, lockend; 7583 7557 const bool write = !!(flags & IOMAP_WRITE); 7584 7558 int ret = 0; 7585 7559 u64 len = length; 7560 + const u64 data_alloc_len = length; 7586 7561 bool unlock_extents = false; 7587 7562 7588 7563 if (!write) ··· 7594 7565 lockend = start + len - 1; 7595 7566 7596 7567 /* 7597 - * The generic stuff only does filemap_write_and_wait_range, which 7598 - * isn't enough if we've written compressed pages to this area, 
so we 7599 - * need to flush the dirty pages again to make absolutely sure that any 7600 - * outstanding dirty pages are on disk. 7568 + * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't 7569 + * enough if we've written compressed pages to this area, so we need to 7570 + * flush the dirty pages again to make absolutely sure that any 7571 + * outstanding dirty pages are on disk - the first flush only starts 7572 + * compression on the data, while keeping the pages locked, so by the 7573 + * time the second flush returns we know bios for the compressed pages 7574 + * were submitted and finished, and the pages no longer under writeback. 7575 + * 7576 + * If we have a NOWAIT request and we have any pages in the range that 7577 + * are locked, likely due to compression still in progress, we don't want 7578 + * to block on page locks. We also don't want to block on pages marked as 7579 + * dirty or under writeback (same as for the non-compression case). 7580 + * iomap_dio_rw() did the same check, but after that and before we got 7581 + * here, mmap'ed writes may have happened or buffered reads started 7582 + * (readpage() and readahead(), which lock pages), as we haven't locked 7583 + * the file range yet. 
7601 7584 */ 7602 7585 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 7603 7586 &BTRFS_I(inode)->runtime_flags)) { 7604 - ret = filemap_fdatawrite_range(inode->i_mapping, start, 7605 - start + length - 1); 7606 - if (ret) 7607 - return ret; 7587 + if (flags & IOMAP_NOWAIT) { 7588 + if (filemap_range_needs_writeback(inode->i_mapping, 7589 + lockstart, lockend)) 7590 + return -EAGAIN; 7591 + } else { 7592 + ret = filemap_fdatawrite_range(inode->i_mapping, start, 7593 + start + length - 1); 7594 + if (ret) 7595 + return ret; 7596 + } 7608 7597 } 7609 7598 7610 - dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); 7611 - if (!dio_data) 7612 - return -ENOMEM; 7599 + memset(dio_data, 0, sizeof(*dio_data)); 7613 7600 7614 - iomap->private = dio_data; 7615 - 7601 + /* 7602 + * We always try to allocate data space and must do it before locking 7603 + * the file range, to avoid deadlocks with concurrent writes to the same 7604 + * range if the range has several extents and the writes don't expand the 7605 + * current i_size (the inode lock is taken in shared mode). If we fail to 7606 + * allocate data space here we continue and later, after locking the 7607 + * file range, we fail with ENOSPC only if we figure out we can not do a 7608 + * NOCOW write. 7609 + */ 7610 + if (write && !(flags & IOMAP_NOWAIT)) { 7611 + ret = btrfs_check_data_free_space(BTRFS_I(inode), 7612 + &dio_data->data_reserved, 7613 + start, data_alloc_len); 7614 + if (!ret) 7615 + dio_data->data_space_reserved = true; 7616 + else if (ret && !(BTRFS_I(inode)->flags & 7617 + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) 7618 + goto err; 7619 + } 7616 7620 7617 7621 /* 7618 7622 * If this errors out it's because we couldn't invalidate pagecache for 7619 - * this range and we need to fallback to buffered. 7623 + * this range and we need to fallback to buffered IO, or we are doing a 7624 + * NOWAIT read/write and we need to block. 
7620 7625 */ 7621 - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { 7622 - ret = -ENOTBLK; 7626 + ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); 7627 + if (ret < 0) 7623 7628 goto err; 7624 - } 7625 7629 7626 7630 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); 7627 7631 if (IS_ERR(em)) { ··· 7714 7652 7715 7653 if (write) { 7716 7654 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, 7717 - start, len); 7655 + start, len, flags); 7718 7656 if (ret < 0) 7719 7657 goto unlock_err; 7720 7658 unlock_extents = true; 7721 7659 /* Recalc len in case the new em is smaller than requested */ 7722 7660 len = min(len, em->len - (start - em->start)); 7661 + if (dio_data->data_space_reserved) { 7662 + u64 release_offset; 7663 + u64 release_len = 0; 7664 + 7665 + if (dio_data->nocow_done) { 7666 + release_offset = start; 7667 + release_len = data_alloc_len; 7668 + } else if (len < data_alloc_len) { 7669 + release_offset = start + len; 7670 + release_len = data_alloc_len - len; 7671 + } 7672 + 7673 + if (release_len > 0) 7674 + btrfs_free_reserved_data_space(BTRFS_I(inode), 7675 + dio_data->data_reserved, 7676 + release_offset, 7677 + release_len); 7678 + } 7723 7679 } else { 7724 7680 /* 7725 7681 * We need to unlock only the end area that we aren't using. 
··· 7782 7702 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 7783 7703 &cached_state); 7784 7704 err: 7785 - kfree(dio_data); 7705 + if (dio_data->data_space_reserved) { 7706 + btrfs_free_reserved_data_space(BTRFS_I(inode), 7707 + dio_data->data_reserved, 7708 + start, data_alloc_len); 7709 + extent_changeset_free(dio_data->data_reserved); 7710 + } 7786 7711 7787 7712 return ret; 7788 7713 } ··· 7795 7710 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, 7796 7711 ssize_t written, unsigned int flags, struct iomap *iomap) 7797 7712 { 7798 - int ret = 0; 7799 - struct btrfs_dio_data *dio_data = iomap->private; 7713 + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); 7714 + struct btrfs_dio_data *dio_data = iter->private; 7800 7715 size_t submitted = dio_data->submitted; 7801 7716 const bool write = !!(flags & IOMAP_WRITE); 7717 + int ret = 0; 7802 7718 7803 7719 if (!write && (iomap->type == IOMAP_HOLE)) { 7804 7720 /* If reading from a hole, unlock and return */ 7805 7721 unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); 7806 - goto out; 7722 + return 0; 7807 7723 } 7808 7724 7809 7725 if (submitted < length) { ··· 7821 7735 7822 7736 if (write) 7823 7737 extent_changeset_free(dio_data->data_reserved); 7824 - out: 7825 - kfree(dio_data); 7826 - iomap->private = NULL; 7827 - 7828 7738 return ret; 7829 7739 } 7830 7740 ··· 7833 7751 if (!refcount_dec_and_test(&dip->refs)) 7834 7752 return; 7835 7753 7836 - if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { 7754 + if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { 7837 7755 __endio_write_update_ordered(BTRFS_I(dip->inode), 7838 7756 dip->file_offset, 7839 7757 dip->bytes, 7840 - !dip->dio_bio->bi_status); 7758 + !dip->bio.bi_status); 7841 7759 } else { 7842 7760 unlock_extent(&BTRFS_I(dip->inode)->io_tree, 7843 7761 dip->file_offset, 7844 7762 dip->file_offset + dip->bytes - 1); 7845 7763 } 7846 7764 7847 - bio_endio(dip->dio_bio); 7848 - 
kfree(dip); 7765 + kfree(dip->csums); 7766 + bio_endio(&dip->bio); 7849 7767 } 7850 7768 7851 - static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, 7852 - int mirror_num, 7853 - unsigned long bio_flags) 7769 + static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, 7770 + int mirror_num, 7771 + enum btrfs_compression_type compress_type) 7854 7772 { 7855 7773 struct btrfs_dio_private *dip = bio->bi_private; 7856 7774 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 7857 - blk_status_t ret; 7858 7775 7859 7776 BUG_ON(bio_op(bio) == REQ_OP_WRITE); 7860 7777 7861 - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); 7862 - if (ret) 7863 - return ret; 7778 + if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA)) 7779 + return; 7864 7780 7865 7781 refcount_inc(&dip->refs); 7866 - ret = btrfs_map_bio(fs_info, bio, mirror_num); 7867 - if (ret) 7782 + if (btrfs_map_bio(fs_info, bio, mirror_num)) 7868 7783 refcount_dec(&dip->refs); 7869 - return ret; 7870 7784 } 7871 7785 7872 7786 static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, ··· 7947 7869 err = btrfs_check_read_dio_bio(dip, bbio, !err); 7948 7870 7949 7871 if (err) 7950 - dip->dio_bio->bi_status = err; 7872 + dip->bio.bi_status = err; 7951 7873 7952 7874 btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); 7953 7875 ··· 7977 7899 goto map; 7978 7900 7979 7901 if (write && async_submit) { 7980 - ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset, 7902 + ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset, 7981 7903 btrfs_submit_bio_start_direct_io); 7982 7904 goto err; 7983 7905 } else if (write) { ··· 8002 7924 return ret; 8003 7925 } 8004 7926 8005 - /* 8006 - * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked 8007 - * or ordered extents whether or not we submit any bios. 
8008 - */ 8009 - static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, 8010 - struct inode *inode, 8011 - loff_t file_offset) 8012 - { 8013 - const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); 8014 - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); 8015 - size_t dip_size; 8016 - struct btrfs_dio_private *dip; 8017 - 8018 - dip_size = sizeof(*dip); 8019 - if (!write && csum) { 8020 - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8021 - size_t nblocks; 8022 - 8023 - nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; 8024 - dip_size += fs_info->csum_size * nblocks; 8025 - } 8026 - 8027 - dip = kzalloc(dip_size, GFP_NOFS); 8028 - if (!dip) 8029 - return NULL; 8030 - 8031 - dip->inode = inode; 8032 - dip->file_offset = file_offset; 8033 - dip->bytes = dio_bio->bi_iter.bi_size; 8034 - dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; 8035 - dip->dio_bio = dio_bio; 8036 - refcount_set(&dip->refs, 1); 8037 - return dip; 8038 - } 8039 - 8040 7927 static void btrfs_submit_direct(const struct iomap_iter *iter, 8041 7928 struct bio *dio_bio, loff_t file_offset) 8042 7929 { 7930 + struct btrfs_dio_private *dip = 7931 + container_of(dio_bio, struct btrfs_dio_private, bio); 8043 7932 struct inode *inode = iter->inode; 8044 7933 const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); 8045 7934 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8046 7935 const bool raid56 = (btrfs_data_alloc_profile(fs_info) & 8047 7936 BTRFS_BLOCK_GROUP_RAID56_MASK); 8048 - struct btrfs_dio_private *dip; 8049 7937 struct bio *bio; 8050 7938 u64 start_sector; 8051 7939 int async_submit = 0; ··· 8022 7978 int ret; 8023 7979 blk_status_t status; 8024 7980 struct btrfs_io_geometry geom; 8025 - struct btrfs_dio_data *dio_data = iter->iomap.private; 7981 + struct btrfs_dio_data *dio_data = iter->private; 8026 7982 struct extent_map *em = NULL; 8027 7983 8028 - dip = btrfs_create_dio_private(dio_bio, inode, file_offset); 
8029 - if (!dip) { 8030 - if (!write) { 8031 - unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, 8032 - file_offset + dio_bio->bi_iter.bi_size - 1); 8033 - } 8034 - dio_bio->bi_status = BLK_STS_RESOURCE; 8035 - bio_endio(dio_bio); 8036 - return; 8037 - } 7984 + dip->inode = inode; 7985 + dip->file_offset = file_offset; 7986 + dip->bytes = dio_bio->bi_iter.bi_size; 7987 + refcount_set(&dip->refs, 1); 7988 + dip->csums = NULL; 8038 7989 8039 - if (!write) { 7990 + if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 7991 + unsigned int nr_sectors = 7992 + (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); 7993 + 8040 7994 /* 8041 7995 * Load the csums up front to reduce csum tree searches and 8042 7996 * contention when submitting bios. 8043 - * 8044 - * If we have csums disabled this will do nothing. 8045 7997 */ 7998 + status = BLK_STS_RESOURCE; 7999 + dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); 8000 + if (!dip->csums) 8001 + goto out_err; 8002 + 8046 8003 status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); 8047 8004 if (status != BLK_STS_OK) 8048 8005 goto out_err; ··· 8133 8088 out_err_em: 8134 8089 free_extent_map(em); 8135 8090 out_err: 8136 - dip->dio_bio->bi_status = status; 8091 + dio_bio->bi_status = status; 8137 8092 btrfs_dio_private_put(dip); 8138 8093 } 8139 8094 8140 - const struct iomap_ops btrfs_dio_iomap_ops = { 8095 + static const struct iomap_ops btrfs_dio_iomap_ops = { 8141 8096 .iomap_begin = btrfs_dio_iomap_begin, 8142 8097 .iomap_end = btrfs_dio_iomap_end, 8143 8098 }; 8144 8099 8145 - const struct iomap_dio_ops btrfs_dio_ops = { 8100 + static const struct iomap_dio_ops btrfs_dio_ops = { 8146 8101 .submit_io = btrfs_submit_direct, 8102 + .bio_set = &btrfs_dio_bioset, 8147 8103 }; 8104 + 8105 + ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) 8106 + { 8107 + struct btrfs_dio_data data; 8108 + 8109 + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
8110 + IOMAP_DIO_PARTIAL, &data, done_before); 8111 + } 8148 8112 8149 8113 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 8150 8114 u64 start, u64 len) ··· 8165 8111 return ret; 8166 8112 8167 8113 return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); 8168 - } 8169 - 8170 - int btrfs_readpage(struct file *file, struct page *page) 8171 - { 8172 - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 8173 - u64 start = page_offset(page); 8174 - u64 end = start + PAGE_SIZE - 1; 8175 - struct btrfs_bio_ctrl bio_ctrl = { 0 }; 8176 - int ret; 8177 - 8178 - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); 8179 - 8180 - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); 8181 - if (bio_ctrl.bio) { 8182 - int ret2; 8183 - 8184 - ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); 8185 - if (ret == 0) 8186 - ret = ret2; 8187 - } 8188 - return ret; 8189 8114 } 8190 8115 8191 8116 static int btrfs_writepage(struct page *page, struct writeback_control *wbc) ··· 8215 8182 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 8216 8183 struct btrfs_subpage *subpage; 8217 8184 8218 - if (fs_info->sectorsize == PAGE_SIZE) 8185 + if (!btrfs_is_subpage(fs_info, page)) 8219 8186 return; 8220 8187 8221 8188 ASSERT(PagePrivate(page) && page->private); ··· 8797 8764 return ret; 8798 8765 } 8799 8766 8800 - /* 8801 - * create a new subvolume directory/inode (helper for the ioctl). 
8802 - */ 8803 - int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 8804 - struct btrfs_root *new_root, 8805 - struct btrfs_root *parent_root, 8806 - struct user_namespace *mnt_userns) 8767 + struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, 8768 + struct inode *dir) 8807 8769 { 8808 8770 struct inode *inode; 8809 - int err; 8810 - u64 index = 0; 8811 - u64 ino; 8812 8771 8813 - err = btrfs_get_free_objectid(new_root, &ino); 8814 - if (err < 0) 8815 - return err; 8816 - 8817 - inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, 8818 - ino, ino, 8819 - S_IFDIR | (~current_umask() & S_IRWXUGO), 8820 - &index); 8821 - if (IS_ERR(inode)) 8822 - return PTR_ERR(inode); 8823 - inode->i_op = &btrfs_dir_inode_operations; 8824 - inode->i_fop = &btrfs_dir_file_operations; 8825 - 8826 - set_nlink(inode, 1); 8827 - btrfs_i_size_write(BTRFS_I(inode), 0); 8828 - unlock_new_inode(inode); 8829 - 8830 - err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 8831 - if (err) 8832 - btrfs_err(new_root->fs_info, 8833 - "error inheriting subvolume %llu properties: %d", 8834 - new_root->root_key.objectid, err); 8835 - 8836 - err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); 8837 - 8838 - iput(inode); 8839 - return err; 8772 + inode = new_inode(dir->i_sb); 8773 + if (inode) { 8774 + /* 8775 + * Subvolumes don't inherit the sgid bit or the parent's gid if 8776 + * the parent's sgid bit is set. This is probably a bug. 
8777 + */ 8778 + inode_init_owner(mnt_userns, inode, NULL, 8779 + S_IFDIR | (~current_umask() & S_IRWXUGO)); 8780 + inode->i_op = &btrfs_dir_inode_operations; 8781 + inode->i_fop = &btrfs_dir_file_operations; 8782 + } 8783 + return inode; 8840 8784 } 8841 8785 8842 8786 struct inode *btrfs_alloc_inode(struct super_block *sb) ··· 8953 8943 8954 8944 static void init_once(void *foo) 8955 8945 { 8956 - struct btrfs_inode *ei = (struct btrfs_inode *) foo; 8946 + struct btrfs_inode *ei = foo; 8957 8947 8958 8948 inode_init_once(&ei->vfs_inode); 8959 8949 } ··· 8965 8955 * destroy cache. 8966 8956 */ 8967 8957 rcu_barrier(); 8958 + bioset_exit(&btrfs_dio_bioset); 8968 8959 kmem_cache_destroy(btrfs_inode_cachep); 8969 8960 kmem_cache_destroy(btrfs_trans_handle_cachep); 8970 8961 kmem_cache_destroy(btrfs_path_cachep); ··· 9004 8993 PAGE_SIZE, PAGE_SIZE, 9005 8994 SLAB_MEM_SPREAD, NULL); 9006 8995 if (!btrfs_free_space_bitmap_cachep) 8996 + goto fail; 8997 + 8998 + if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, 8999 + offsetof(struct btrfs_dio_private, bio), 9000 + BIOSET_NEED_BVECS)) 9007 9001 goto fail; 9008 9002 9009 9003 return 0; ··· 9066 9050 { 9067 9051 struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); 9068 9052 struct btrfs_trans_handle *trans; 9053 + unsigned int trans_num_items; 9069 9054 struct btrfs_root *root = BTRFS_I(old_dir)->root; 9070 9055 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9071 9056 struct inode *new_inode = new_dentry->d_inode; ··· 9098 9081 down_read(&fs_info->subvol_sem); 9099 9082 9100 9083 /* 9101 - * We want to reserve the absolute worst case amount of items. So if 9102 - * both inodes are subvols and we need to unlink them then that would 9103 - * require 4 item modifications, but if they are both normal inodes it 9104 - * would require 5 item modifications, so we'll assume their normal 9105 - * inodes. 
So 5 * 2 is 10, plus 2 for the new links, so 12 total items 9106 - * should cover the worst case number of items we'll modify. 9084 + * For each inode: 9085 + * 1 to remove old dir item 9086 + * 1 to remove old dir index 9087 + * 1 to add new dir item 9088 + * 1 to add new dir index 9089 + * 1 to update parent inode 9090 + * 9091 + * If the parents are the same, we only need to account for one 9107 9092 */ 9108 - trans = btrfs_start_transaction(root, 12); 9093 + trans_num_items = (old_dir == new_dir ? 9 : 10); 9094 + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 9095 + /* 9096 + * 1 to remove old root ref 9097 + * 1 to remove old root backref 9098 + * 1 to add new root ref 9099 + * 1 to add new root backref 9100 + */ 9101 + trans_num_items += 4; 9102 + } else { 9103 + /* 9104 + * 1 to update inode item 9105 + * 1 to remove old inode ref 9106 + * 1 to add new inode ref 9107 + */ 9108 + trans_num_items += 3; 9109 + } 9110 + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) 9111 + trans_num_items += 4; 9112 + else 9113 + trans_num_items += 3; 9114 + trans = btrfs_start_transaction(root, trans_num_items); 9109 9115 if (IS_ERR(trans)) { 9110 9116 ret = PTR_ERR(trans); 9111 9117 goto out_notrans; ··· 9295 9255 return ret; 9296 9256 } 9297 9257 9298 - static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, 9299 - struct btrfs_root *root, 9300 - struct user_namespace *mnt_userns, 9301 - struct inode *dir, 9302 - struct dentry *dentry) 9258 + static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, 9259 + struct inode *dir) 9303 9260 { 9304 - int ret; 9305 9261 struct inode *inode; 9306 - u64 objectid; 9307 - u64 index; 9308 9262 9309 - ret = btrfs_get_free_objectid(root, &objectid); 9310 - if (ret) 9311 - return ret; 9312 - 9313 - inode = btrfs_new_inode(trans, root, mnt_userns, dir, 9314 - dentry->d_name.name, 9315 - dentry->d_name.len, 9316 - btrfs_ino(BTRFS_I(dir)), 9317 - objectid, 9318 - S_IFCHR | WHITEOUT_MODE, 9319 - &index); 9320 - 9321 - if 
(IS_ERR(inode)) { 9322 - ret = PTR_ERR(inode); 9323 - return ret; 9263 + inode = new_inode(dir->i_sb); 9264 + if (inode) { 9265 + inode_init_owner(mnt_userns, inode, dir, 9266 + S_IFCHR | WHITEOUT_MODE); 9267 + inode->i_op = &btrfs_special_inode_operations; 9268 + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); 9324 9269 } 9325 - 9326 - inode->i_op = &btrfs_special_inode_operations; 9327 - init_special_inode(inode, inode->i_mode, 9328 - WHITEOUT_DEV); 9329 - 9330 - ret = btrfs_init_inode_security(trans, inode, dir, 9331 - &dentry->d_name); 9332 - if (ret) 9333 - goto out; 9334 - 9335 - ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, 9336 - BTRFS_I(inode), 0, index); 9337 - if (ret) 9338 - goto out; 9339 - 9340 - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 9341 - out: 9342 - unlock_new_inode(inode); 9343 - if (ret) 9344 - inode_dec_link_count(inode); 9345 - iput(inode); 9346 - 9347 - return ret; 9270 + return inode; 9348 9271 } 9349 9272 9350 9273 static int btrfs_rename(struct user_namespace *mnt_userns, ··· 9316 9313 unsigned int flags) 9317 9314 { 9318 9315 struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); 9316 + struct btrfs_new_inode_args whiteout_args = { 9317 + .dir = old_dir, 9318 + .dentry = old_dentry, 9319 + }; 9319 9320 struct btrfs_trans_handle *trans; 9320 9321 unsigned int trans_num_items; 9321 9322 struct btrfs_root *root = BTRFS_I(old_dir)->root; ··· 9374 9367 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) 9375 9368 filemap_flush(old_inode->i_mapping); 9376 9369 9377 - /* close the racy window with snapshot create/destroy ioctl */ 9378 - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9370 + if (flags & RENAME_WHITEOUT) { 9371 + whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir); 9372 + if (!whiteout_args.inode) 9373 + return -ENOMEM; 9374 + ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); 9375 + if (ret) 9376 + goto out_whiteout_inode; 9377 + } else { 9378 + /* 1 to update the 
old parent inode. */ 9379 + trans_num_items = 1; 9380 + } 9381 + 9382 + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { 9383 + /* Close the race window with snapshot create/destroy ioctl */ 9379 9384 down_read(&fs_info->subvol_sem); 9385 + /* 9386 + * 1 to remove old root ref 9387 + * 1 to remove old root backref 9388 + * 1 to add new root ref 9389 + * 1 to add new root backref 9390 + */ 9391 + trans_num_items += 4; 9392 + } else { 9393 + /* 9394 + * 1 to update inode 9395 + * 1 to remove old inode ref 9396 + * 1 to add new inode ref 9397 + */ 9398 + trans_num_items += 3; 9399 + } 9380 9400 /* 9381 - * We want to reserve the absolute worst case amount of items. So if 9382 - * both inodes are subvols and we need to unlink them then that would 9383 - * require 4 item modifications, but if they are both normal inodes it 9384 - * would require 5 item modifications, so we'll assume they are normal 9385 - * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 9386 - * should cover the worst case number of items we'll modify. 9387 - * If our rename has the whiteout flag, we need more 5 units for the 9388 - * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item 9389 - * when selinux is enabled). 
9401 + * 1 to remove old dir item 9402 + * 1 to remove old dir index 9403 + * 1 to add new dir item 9404 + * 1 to add new dir index 9390 9405 */ 9391 - trans_num_items = 11; 9392 - if (flags & RENAME_WHITEOUT) 9406 + trans_num_items += 4; 9407 + /* 1 to update new parent inode if it's not the same as the old parent */ 9408 + if (new_dir != old_dir) 9409 + trans_num_items++; 9410 + if (new_inode) { 9411 + /* 9412 + * 1 to update inode 9413 + * 1 to remove inode ref 9414 + * 1 to remove dir item 9415 + * 1 to remove dir index 9416 + * 1 to possibly add orphan item 9417 + */ 9393 9418 trans_num_items += 5; 9419 + } 9394 9420 trans = btrfs_start_transaction(root, trans_num_items); 9395 9421 if (IS_ERR(trans)) { 9396 9422 ret = PTR_ERR(trans); ··· 9519 9479 rename_ctx.index, new_dentry->d_parent); 9520 9480 9521 9481 if (flags & RENAME_WHITEOUT) { 9522 - ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, 9523 - old_dir, old_dentry); 9524 - 9482 + ret = btrfs_create_new_inode(trans, &whiteout_args); 9525 9483 if (ret) { 9526 9484 btrfs_abort_transaction(trans, ret); 9527 9485 goto out_fail; 9486 + } else { 9487 + unlock_new_inode(whiteout_args.inode); 9488 + iput(whiteout_args.inode); 9489 + whiteout_args.inode = NULL; 9528 9490 } 9529 9491 } 9530 9492 out_fail: ··· 9535 9493 out_notrans: 9536 9494 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9537 9495 up_read(&fs_info->subvol_sem); 9538 - 9496 + if (flags & RENAME_WHITEOUT) 9497 + btrfs_new_inode_args_destroy(&whiteout_args); 9498 + out_whiteout_inode: 9499 + if (flags & RENAME_WHITEOUT) 9500 + iput(whiteout_args.inode); 9539 9501 return ret; 9540 9502 } 9541 9503 ··· 9758 9712 struct btrfs_root *root = BTRFS_I(dir)->root; 9759 9713 struct btrfs_path *path; 9760 9714 struct btrfs_key key; 9761 - struct inode *inode = NULL; 9715 + struct inode *inode; 9716 + struct btrfs_new_inode_args new_inode_args = { 9717 + .dir = dir, 9718 + .dentry = dentry, 9719 + }; 9720 + unsigned int trans_num_items; 9762 9721 int err; 9763 
- u64 objectid; 9764 - u64 index = 0; 9765 9722 int name_len; 9766 9723 int datasize; 9767 9724 unsigned long ptr; ··· 9775 9726 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) 9776 9727 return -ENAMETOOLONG; 9777 9728 9778 - /* 9779 - * 2 items for inode item and ref 9780 - * 2 items for dir items 9781 - * 1 item for updating parent inode item 9782 - * 1 item for the inline extent item 9783 - * 1 item for xattr if selinux is on 9784 - */ 9785 - trans = btrfs_start_transaction(root, 7); 9786 - if (IS_ERR(trans)) 9787 - return PTR_ERR(trans); 9729 + inode = new_inode(dir->i_sb); 9730 + if (!inode) 9731 + return -ENOMEM; 9732 + inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO); 9733 + inode->i_op = &btrfs_symlink_inode_operations; 9734 + inode_nohighmem(inode); 9735 + inode->i_mapping->a_ops = &btrfs_aops; 9736 + btrfs_i_size_write(BTRFS_I(inode), name_len); 9737 + inode_set_bytes(inode, name_len); 9788 9738 9789 - err = btrfs_get_free_objectid(root, &objectid); 9739 + new_inode_args.inode = inode; 9740 + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); 9790 9741 if (err) 9791 - goto out_unlock; 9742 + goto out_inode; 9743 + /* 1 additional item for the inline extent */ 9744 + trans_num_items++; 9792 9745 9793 - inode = btrfs_new_inode(trans, root, mnt_userns, dir, 9794 - dentry->d_name.name, dentry->d_name.len, 9795 - btrfs_ino(BTRFS_I(dir)), objectid, 9796 - S_IFLNK | S_IRWXUGO, &index); 9797 - if (IS_ERR(inode)) { 9798 - err = PTR_ERR(inode); 9799 - inode = NULL; 9800 - goto out_unlock; 9746 + trans = btrfs_start_transaction(root, trans_num_items); 9747 + if (IS_ERR(trans)) { 9748 + err = PTR_ERR(trans); 9749 + goto out_new_inode_args; 9801 9750 } 9802 9751 9803 - /* 9804 - * If the active LSM wants to access the inode during 9805 - * d_instantiate it needs these. Smack checks to see 9806 - * if the filesystem supports xattrs by looking at the 9807 - * ops vector. 
9808 - */ 9809 - inode->i_fop = &btrfs_file_operations; 9810 - inode->i_op = &btrfs_file_inode_operations; 9811 - inode->i_mapping->a_ops = &btrfs_aops; 9812 - 9813 - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 9752 + err = btrfs_create_new_inode(trans, &new_inode_args); 9814 9753 if (err) 9815 - goto out_unlock; 9754 + goto out; 9816 9755 9817 9756 path = btrfs_alloc_path(); 9818 9757 if (!path) { 9819 9758 err = -ENOMEM; 9820 - goto out_unlock; 9759 + btrfs_abort_transaction(trans, err); 9760 + discard_new_inode(inode); 9761 + inode = NULL; 9762 + goto out; 9821 9763 } 9822 9764 key.objectid = btrfs_ino(BTRFS_I(inode)); 9823 9765 key.offset = 0; ··· 9817 9777 err = btrfs_insert_empty_item(trans, root, path, &key, 9818 9778 datasize); 9819 9779 if (err) { 9780 + btrfs_abort_transaction(trans, err); 9820 9781 btrfs_free_path(path); 9821 - goto out_unlock; 9782 + discard_new_inode(inode); 9783 + inode = NULL; 9784 + goto out; 9822 9785 } 9823 9786 leaf = path->nodes[0]; 9824 9787 ei = btrfs_item_ptr(leaf, path->slots[0], ··· 9839 9796 btrfs_mark_buffer_dirty(leaf); 9840 9797 btrfs_free_path(path); 9841 9798 9842 - inode->i_op = &btrfs_symlink_inode_operations; 9843 - inode_nohighmem(inode); 9844 - inode_set_bytes(inode, name_len); 9845 - btrfs_i_size_write(BTRFS_I(inode), name_len); 9846 - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); 9847 - /* 9848 - * Last step, add directory indexes for our symlink inode. This is the 9849 - * last step to avoid extra cleanup of these indexes if an error happens 9850 - * elsewhere above. 
9851 - */ 9852 - if (!err) 9853 - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, 9854 - BTRFS_I(inode), 0, index); 9855 - if (err) 9856 - goto out_unlock; 9857 - 9858 9799 d_instantiate_new(dentry, inode); 9859 - 9860 - out_unlock: 9800 + err = 0; 9801 + out: 9861 9802 btrfs_end_transaction(trans); 9862 - if (err && inode) { 9863 - inode_dec_link_count(inode); 9864 - discard_new_inode(inode); 9865 - } 9866 9803 btrfs_btree_balance_dirty(fs_info); 9804 + out_new_inode_args: 9805 + btrfs_new_inode_args_destroy(&new_inode_args); 9806 + out_inode: 9807 + if (err) 9808 + iput(inode); 9867 9809 return err; 9868 9810 } 9869 9811 ··· 10099 10071 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 10100 10072 struct btrfs_trans_handle *trans; 10101 10073 struct btrfs_root *root = BTRFS_I(dir)->root; 10102 - struct inode *inode = NULL; 10103 - u64 objectid; 10104 - u64 index; 10105 - int ret = 0; 10074 + struct inode *inode; 10075 + struct btrfs_new_inode_args new_inode_args = { 10076 + .dir = dir, 10077 + .dentry = dentry, 10078 + .orphan = true, 10079 + }; 10080 + unsigned int trans_num_items; 10081 + int ret; 10106 10082 10107 - /* 10108 - * 5 units required for adding orphan entry 10109 - */ 10110 - trans = btrfs_start_transaction(root, 5); 10111 - if (IS_ERR(trans)) 10112 - return PTR_ERR(trans); 10113 - 10114 - ret = btrfs_get_free_objectid(root, &objectid); 10115 - if (ret) 10116 - goto out; 10117 - 10118 - inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0, 10119 - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); 10120 - if (IS_ERR(inode)) { 10121 - ret = PTR_ERR(inode); 10122 - inode = NULL; 10123 - goto out; 10124 - } 10125 - 10083 + inode = new_inode(dir->i_sb); 10084 + if (!inode) 10085 + return -ENOMEM; 10086 + inode_init_owner(mnt_userns, inode, dir, mode); 10126 10087 inode->i_fop = &btrfs_file_operations; 10127 10088 inode->i_op = &btrfs_file_inode_operations; 10128 - 10129 10089 inode->i_mapping->a_ops = &btrfs_aops; 10130 10090 10131 - ret 
= btrfs_init_inode_security(trans, inode, dir, NULL); 10091 + new_inode_args.inode = inode; 10092 + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); 10132 10093 if (ret) 10133 - goto out; 10094 + goto out_inode; 10134 10095 10135 - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 10136 - if (ret) 10137 - goto out; 10138 - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 10139 - if (ret) 10140 - goto out; 10096 + trans = btrfs_start_transaction(root, trans_num_items); 10097 + if (IS_ERR(trans)) { 10098 + ret = PTR_ERR(trans); 10099 + goto out_new_inode_args; 10100 + } 10101 + 10102 + ret = btrfs_create_new_inode(trans, &new_inode_args); 10141 10103 10142 10104 /* 10143 - * We set number of links to 0 in btrfs_new_inode(), and here we set 10144 - * it to 1 because d_tmpfile() will issue a warning if the count is 0, 10145 - * through: 10105 + * We set number of links to 0 in btrfs_create_new_inode(), and here we 10106 + * set it to 1 because d_tmpfile() will issue a warning if the count is 10107 + * 0, through: 10146 10108 * 10147 10109 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() 10148 10110 */ 10149 10111 set_nlink(inode, 1); 10150 - d_tmpfile(dentry, inode); 10151 - unlock_new_inode(inode); 10152 - mark_inode_dirty(inode); 10153 - out: 10112 + 10113 + if (!ret) { 10114 + d_tmpfile(dentry, inode); 10115 + unlock_new_inode(inode); 10116 + mark_inode_dirty(inode); 10117 + } 10118 + 10154 10119 btrfs_end_transaction(trans); 10155 - if (ret && inode) 10156 - discard_new_inode(inode); 10157 10120 btrfs_btree_balance_dirty(fs_info); 10121 + out_new_inode_args: 10122 + btrfs_new_inode_args_destroy(&new_inode_args); 10123 + out_inode: 10124 + if (ret) 10125 + iput(inode); 10158 10126 return ret; 10159 10127 } 10160 10128 ··· 10482 10458 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); 10483 10459 if (!pages) 10484 10460 return -ENOMEM; 10485 - for (i = 0; i < nr_pages; i++) { 10486 - pages[i] = alloc_page(GFP_NOFS); 10487 - if 
(!pages[i]) { 10488 - ret = -ENOMEM; 10489 - goto out; 10461 + ret = btrfs_alloc_page_array(nr_pages, pages); 10462 + if (ret) { 10463 + ret = -ENOMEM; 10464 + goto out; 10490 10465 } 10491 - } 10492 10466 10493 10467 ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, 10494 10468 disk_io_size, pages); ··· 10822 10800 ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); 10823 10801 if (ret) 10824 10802 goto out_free_data_space; 10825 - ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes); 10803 + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes, 10804 + false); 10826 10805 if (ret) 10827 10806 goto out_qgroup_free_data; 10828 10807 ··· 11330 11307 if (add_bytes > 0) 11331 11308 inode_add_bytes(&inode->vfs_inode, add_bytes); 11332 11309 spin_unlock(&inode->lock); 11310 + } 11311 + 11312 + /** 11313 + * Verify that there are no ordered extents for a given file range. 11314 + * 11315 + * @inode: The target inode. 11316 + * @start: Start offset of the file range, should be sector size aligned. 11317 + * @end: End offset (inclusive) of the file range, its value +1 should be 11318 + * sector size aligned. 11319 + * 11320 + * This should typically be used for cases where we locked an inode's VFS lock in 11321 + * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode, 11322 + * we have flushed all delalloc in the range, we have waited for all ordered 11323 + * extents in the range to complete and finally we have locked the file range in 11324 + * the inode's io_tree. 
11325 + */ 11326 + void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end) 11327 + { 11328 + struct btrfs_root *root = inode->root; 11329 + struct btrfs_ordered_extent *ordered; 11330 + 11331 + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) 11332 + return; 11333 + 11334 + ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start); 11335 + if (ordered) { 11336 + btrfs_err(root->fs_info, 11337 + "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", 11338 + start, end, btrfs_ino(inode), root->root_key.objectid, 11339 + ordered->file_offset, 11340 + ordered->file_offset + ordered->num_bytes - 1); 11341 + btrfs_put_ordered_extent(ordered); 11342 + } 11343 + 11344 + ASSERT(ordered == NULL); 11333 11345 } 11334 11346 11335 11347 static const struct inode_operations btrfs_dir_inode_operations = {

+142 -138

fs/btrfs/ioctl.c

··· 540 540 return 1; 541 541 } 542 542 543 + /* 544 + * Calculate the number of transaction items to reserve for creating a subvolume 545 + * or snapshot, not including the inode, directory entries, or parent directory. 546 + */ 547 + static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) 548 + { 549 + /* 550 + * 1 to add root block 551 + * 1 to add root item 552 + * 1 to add root ref 553 + * 1 to add root backref 554 + * 1 to add UUID item 555 + * 1 to add qgroup info 556 + * 1 to add qgroup limit 557 + * 558 + * Ideally the last two would only be accounted if qgroups are enabled, 559 + * but that can change between now and the time we would insert them. 560 + */ 561 + unsigned int num_items = 7; 562 + 563 + if (inherit) { 564 + /* 2 to add qgroup relations for each inherited qgroup */ 565 + num_items += 2 * inherit->num_qgroups; 566 + } 567 + return num_items; 568 + } 569 + 543 570 static noinline int create_subvol(struct user_namespace *mnt_userns, 544 571 struct inode *dir, struct dentry *dentry, 545 - const char *name, int namelen, 546 572 struct btrfs_qgroup_inherit *inherit) 547 573 { 548 574 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); ··· 581 555 struct btrfs_root *new_root; 582 556 struct btrfs_block_rsv block_rsv; 583 557 struct timespec64 cur_time = current_time(dir); 584 - struct inode *inode; 558 + struct btrfs_new_inode_args new_inode_args = { 559 + .dir = dir, 560 + .dentry = dentry, 561 + .subvol = true, 562 + }; 563 + unsigned int trans_num_items; 585 564 int ret; 586 - dev_t anon_dev = 0; 565 + dev_t anon_dev; 587 566 u64 objectid; 588 - u64 index = 0; 589 567 590 568 root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); 591 569 if (!root_item) ··· 597 567 598 568 ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); 599 569 if (ret) 600 - goto fail_free; 601 - 602 - ret = get_anon_bdev(&anon_dev); 603 - if (ret < 0) 604 - goto fail_free; 570 + goto out_root_item; 605 571 606 572 /* 607 573 * Don't 
create subvolume whose level is not zero. Or qgroup will be ··· 605 579 */ 606 580 if (btrfs_qgroup_level(objectid)) { 607 581 ret = -ENOSPC; 608 - goto fail_free; 582 + goto out_root_item; 609 583 } 610 584 611 - btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 612 - /* 613 - * The same as the snapshot creation, please see the comment 614 - * of create_snapshot(). 615 - */ 616 - ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false); 585 + ret = get_anon_bdev(&anon_dev); 586 + if (ret < 0) 587 + goto out_root_item; 588 + 589 + new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir); 590 + if (!new_inode_args.inode) { 591 + ret = -ENOMEM; 592 + goto out_anon_dev; 593 + } 594 + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); 617 595 if (ret) 618 - goto fail_free; 596 + goto out_inode; 597 + trans_num_items += create_subvol_num_items(inherit); 598 + 599 + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 600 + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 601 + trans_num_items, false); 602 + if (ret) 603 + goto out_new_inode_args; 619 604 620 605 trans = btrfs_start_transaction(root, 0); 621 606 if (IS_ERR(trans)) { 622 607 ret = PTR_ERR(trans); 623 608 btrfs_subvolume_release_metadata(root, &block_rsv); 624 - goto fail_free; 609 + goto out_new_inode_args; 625 610 } 626 611 trans->block_rsv = &block_rsv; 627 612 trans->bytes_reserved = block_rsv.size; 628 613 629 614 ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); 630 615 if (ret) 631 - goto fail; 616 + goto out; 632 617 633 618 leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, 634 619 BTRFS_NESTING_NORMAL); 635 620 if (IS_ERR(leaf)) { 636 621 ret = PTR_ERR(leaf); 637 - goto fail; 622 + goto out; 638 623 } 639 624 640 625 btrfs_mark_buffer_dirty(leaf); ··· 700 663 btrfs_tree_unlock(leaf); 701 664 btrfs_free_tree_block(trans, objectid, leaf, 0, 1); 702 665 free_extent_buffer(leaf); 703 - goto fail; 666 + goto out; 704 667 } 705 
668 706 669 free_extent_buffer(leaf); 707 670 leaf = NULL; 708 671 709 - key.offset = (u64)-1; 710 672 new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); 711 673 if (IS_ERR(new_root)) { 712 - free_anon_bdev(anon_dev); 713 674 ret = PTR_ERR(new_root); 714 675 btrfs_abort_transaction(trans, ret); 715 - goto fail; 676 + goto out; 716 677 } 717 - /* Freeing will be done in btrfs_put_root() of new_root */ 678 + /* anon_dev is owned by new_root now. */ 718 679 anon_dev = 0; 680 + BTRFS_I(new_inode_args.inode)->root = new_root; 681 + /* ... and new_root is owned by new_inode_args.inode now. */ 719 682 720 683 ret = btrfs_record_root_in_trans(trans, new_root); 721 684 if (ret) { 722 - btrfs_put_root(new_root); 723 685 btrfs_abort_transaction(trans, ret); 724 - goto fail; 725 - } 726 - 727 - ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns); 728 - btrfs_put_root(new_root); 729 - if (ret) { 730 - /* We potentially lose an unused inode item here */ 731 - btrfs_abort_transaction(trans, ret); 732 - goto fail; 733 - } 734 - 735 - /* 736 - * insert the directory item 737 - */ 738 - ret = btrfs_set_inode_index(BTRFS_I(dir), &index); 739 - if (ret) { 740 - btrfs_abort_transaction(trans, ret); 741 - goto fail; 742 - } 743 - 744 - ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, 745 - BTRFS_FT_DIR, index); 746 - if (ret) { 747 - btrfs_abort_transaction(trans, ret); 748 - goto fail; 749 - } 750 - 751 - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); 752 - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); 753 - if (ret) { 754 - btrfs_abort_transaction(trans, ret); 755 - goto fail; 756 - } 757 - 758 - ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, 759 - btrfs_ino(BTRFS_I(dir)), index, name, namelen); 760 - if (ret) { 761 - btrfs_abort_transaction(trans, ret); 762 - goto fail; 686 + goto out; 763 687 } 764 688 765 689 ret = btrfs_uuid_tree_add(trans, root_item->uuid, 766 690 BTRFS_UUID_KEY_SUBVOL, 
objectid); 767 - if (ret) 691 + if (ret) { 768 692 btrfs_abort_transaction(trans, ret); 693 + goto out; 694 + } 769 695 770 - fail: 771 - kfree(root_item); 696 + ret = btrfs_create_new_inode(trans, &new_inode_args); 697 + if (ret) { 698 + btrfs_abort_transaction(trans, ret); 699 + goto out; 700 + } 701 + 702 + d_instantiate_new(dentry, new_inode_args.inode); 703 + new_inode_args.inode = NULL; 704 + 705 + out: 772 706 trans->block_rsv = NULL; 773 707 trans->bytes_reserved = 0; 774 708 btrfs_subvolume_release_metadata(root, &block_rsv); ··· 748 740 btrfs_end_transaction(trans); 749 741 else 750 742 ret = btrfs_commit_transaction(trans); 751 - 752 - if (!ret) { 753 - inode = btrfs_lookup_dentry(dir, dentry); 754 - if (IS_ERR(inode)) 755 - return PTR_ERR(inode); 756 - d_instantiate(dentry, inode); 757 - } 758 - return ret; 759 - 760 - fail_free: 743 + out_new_inode_args: 744 + btrfs_new_inode_args_destroy(&new_inode_args); 745 + out_inode: 746 + iput(new_inode_args.inode); 747 + out_anon_dev: 761 748 if (anon_dev) 762 749 free_anon_bdev(anon_dev); 750 + out_root_item: 763 751 kfree(root_item); 764 752 return ret; 765 753 } ··· 767 763 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); 768 764 struct inode *inode; 769 765 struct btrfs_pending_snapshot *pending_snapshot; 766 + unsigned int trans_num_items; 770 767 struct btrfs_trans_handle *trans; 771 768 int ret; 772 769 ··· 805 800 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 806 801 BTRFS_BLOCK_RSV_TEMP); 807 802 /* 808 - * 1 - parent dir inode 809 - * 2 - dir entries 810 - * 1 - root item 811 - * 2 - root ref/backref 812 - * 1 - root of snapshot 813 - * 1 - UUID item 803 + * 1 to add dir item 804 + * 1 to add dir index 805 + * 1 to update parent inode item 814 806 */ 807 + trans_num_items = create_subvol_num_items(inherit) + 3; 815 808 ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, 816 - &pending_snapshot->block_rsv, 8, 817 - false); 809 + &pending_snapshot->block_rsv, 810 + trans_num_items, 
false); 818 811 if (ret) 819 812 goto free_pending; 820 813 ··· 982 979 if (snap_src) 983 980 error = create_snapshot(snap_src, dir, dentry, readonly, inherit); 984 981 else 985 - error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit); 982 + error = create_subvol(mnt_userns, dir, dentry, inherit); 986 983 987 984 if (!error) 988 985 fsnotify_mkdir(dir, dentry); ··· 1416 1413 if (!em) 1417 1414 break; 1418 1415 1419 - /* Skip hole/inline/preallocated extents */ 1420 - if (em->block_start >= EXTENT_MAP_LAST_BYTE || 1416 + /* 1417 + * If the file extent is an inlined one, we may still want to 1418 + * defrag it (fallthrough) if it will cause a regular extent. 1419 + * This is for users who want to convert inline extents to 1420 + * regular ones through max_inline= mount option. 1421 + */ 1422 + if (em->block_start == EXTENT_MAP_INLINE && 1423 + em->len <= inode->root->fs_info->max_inline) 1424 + goto next; 1425 + 1426 + /* Skip hole/delalloc/preallocated extents */ 1427 + if (em->block_start == EXTENT_MAP_HOLE || 1428 + em->block_start == EXTENT_MAP_DELALLOC || 1421 1429 test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 1422 1430 goto next; 1423 1431 ··· 1486 1472 */ 1487 1473 if (em->len >= get_extent_max_capacity(em)) 1488 1474 goto next; 1475 + 1476 + /* 1477 + * Normally there are no more extents after an inline one, thus 1478 + * @next_mergeable will normally be false and not defragged. 1479 + * So if an inline extent passed all above checks, just add it 1480 + * for defrag, and be converted to regular extents. 
1481 + */ 1482 + if (em->block_start == EXTENT_MAP_INLINE) 1483 + goto add; 1489 1484 1490 1485 next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, 1491 1486 extent_thresh, newer_than, locked); ··· 2617 2594 static noinline int btrfs_ioctl_tree_search(struct inode *inode, 2618 2595 void __user *argp) 2619 2596 { 2620 - struct btrfs_ioctl_search_args __user *uargs; 2597 + struct btrfs_ioctl_search_args __user *uargs = argp; 2621 2598 struct btrfs_ioctl_search_key sk; 2622 2599 int ret; 2623 2600 size_t buf_size; 2624 2601 2625 2602 if (!capable(CAP_SYS_ADMIN)) 2626 2603 return -EPERM; 2627 - 2628 - uargs = (struct btrfs_ioctl_search_args __user *)argp; 2629 2604 2630 2605 if (copy_from_user(&sk, &uargs->key, sizeof(sk))) 2631 2606 return -EFAULT; ··· 2647 2626 static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, 2648 2627 void __user *argp) 2649 2628 { 2650 - struct btrfs_ioctl_search_args_v2 __user *uarg; 2629 + struct btrfs_ioctl_search_args_v2 __user *uarg = argp; 2651 2630 struct btrfs_ioctl_search_args_v2 args; 2652 2631 int ret; 2653 2632 size_t buf_size; ··· 2657 2636 return -EPERM; 2658 2637 2659 2638 /* copy search header and buffer size */ 2660 - uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; 2661 2639 if (copy_from_user(&args, uarg, sizeof(args))) 2662 2640 return -EFAULT; 2663 2641 ··· 4364 4344 bool need_unlock; /* for mut. excl. 
ops lock */ 4365 4345 int ret; 4366 4346 4367 - if (!arg) 4368 - btrfs_warn(fs_info, 4369 - "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18"); 4370 - 4371 4347 if (!capable(CAP_SYS_ADMIN)) 4372 4348 return -EPERM; 4373 4349 4374 4350 ret = mnt_want_write_file(file); 4375 4351 if (ret) 4376 4352 return ret; 4353 + 4354 + bargs = memdup_user(arg, sizeof(*bargs)); 4355 + if (IS_ERR(bargs)) { 4356 + ret = PTR_ERR(bargs); 4357 + bargs = NULL; 4358 + goto out; 4359 + } 4377 4360 4378 4361 again: 4379 4362 if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { ··· 4425 4402 } 4426 4403 4427 4404 locked: 4428 - 4429 - if (arg) { 4430 - bargs = memdup_user(arg, sizeof(*bargs)); 4431 - if (IS_ERR(bargs)) { 4432 - ret = PTR_ERR(bargs); 4405 + if (bargs->flags & BTRFS_BALANCE_RESUME) { 4406 + if (!fs_info->balance_ctl) { 4407 + ret = -ENOTCONN; 4433 4408 goto out_unlock; 4434 4409 } 4435 4410 4436 - if (bargs->flags & BTRFS_BALANCE_RESUME) { 4437 - if (!fs_info->balance_ctl) { 4438 - ret = -ENOTCONN; 4439 - goto out_bargs; 4440 - } 4411 + bctl = fs_info->balance_ctl; 4412 + spin_lock(&fs_info->balance_lock); 4413 + bctl->flags |= BTRFS_BALANCE_RESUME; 4414 + spin_unlock(&fs_info->balance_lock); 4415 + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); 4441 4416 4442 - bctl = fs_info->balance_ctl; 4443 - spin_lock(&fs_info->balance_lock); 4444 - bctl->flags |= BTRFS_BALANCE_RESUME; 4445 - spin_unlock(&fs_info->balance_lock); 4446 - btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); 4417 + goto do_balance; 4418 + } 4447 4419 4448 - goto do_balance; 4449 - } 4450 - } else { 4451 - bargs = NULL; 4420 + if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { 4421 + ret = -EINVAL; 4422 + goto out_unlock; 4452 4423 } 4453 4424 4454 4425 if (fs_info->balance_ctl) { 4455 4426 ret = -EINPROGRESS; 4456 - goto out_bargs; 4427 + goto out_unlock; 4457 4428 } 4458 4429 4459 4430 bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); 4460 4431 if (!bctl) 
{ 4461 4432 ret = -ENOMEM; 4462 - goto out_bargs; 4433 + goto out_unlock; 4463 4434 } 4464 4435 4465 - if (arg) { 4466 - memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); 4467 - memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); 4468 - memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); 4436 + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); 4437 + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); 4438 + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); 4469 4439 4470 - bctl->flags = bargs->flags; 4471 - } else { 4472 - /* balance everything - no filters */ 4473 - bctl->flags |= BTRFS_BALANCE_TYPE_MASK; 4474 - } 4475 - 4476 - if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { 4477 - ret = -EINVAL; 4478 - goto out_bctl; 4479 - } 4480 - 4440 + bctl->flags = bargs->flags; 4481 4441 do_balance: 4482 4442 /* 4483 4443 * Ownership of bctl and exclusive operation goes to btrfs_balance. ··· 4473 4467 ret = btrfs_balance(fs_info, bctl, bargs); 4474 4468 bctl = NULL; 4475 4469 4476 - if ((ret == 0 || ret == -ECANCELED) && arg) { 4470 + if (ret == 0 || ret == -ECANCELED) { 4477 4471 if (copy_to_user(arg, bargs, sizeof(*bargs))) 4478 4472 ret = -EFAULT; 4479 4473 } 4480 4474 4481 - out_bctl: 4482 4475 kfree(bctl); 4483 - out_bargs: 4484 - kfree(bargs); 4485 4476 out_unlock: 4486 4477 mutex_unlock(&fs_info->balance_mutex); 4487 4478 if (need_unlock) 4488 4479 btrfs_exclop_finish(fs_info); 4489 4480 out: 4490 4481 mnt_drop_write_file(file); 4482 + kfree(bargs); 4491 4483 return ret; 4492 4484 } 4493 4485

+2 -38

fs/btrfs/props.c

··· 380 380 }, 381 381 }; 382 382 383 - static int inherit_props(struct btrfs_trans_handle *trans, 384 - struct inode *inode, 385 - struct inode *parent) 383 + int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, 384 + struct inode *inode, struct inode *parent) 386 385 { 387 386 struct btrfs_root *root = BTRFS_I(inode)->root; 388 387 struct btrfs_fs_info *fs_info = root->fs_info; ··· 454 455 } 455 456 456 457 return 0; 457 - } 458 - 459 - int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, 460 - struct inode *inode, 461 - struct inode *dir) 462 - { 463 - if (!dir) 464 - return 0; 465 - 466 - return inherit_props(trans, inode, dir); 467 - } 468 - 469 - int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, 470 - struct btrfs_root *root, 471 - struct btrfs_root *parent_root) 472 - { 473 - struct super_block *sb = root->fs_info->sb; 474 - struct inode *parent_inode, *child_inode; 475 - int ret; 476 - 477 - parent_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, parent_root); 478 - if (IS_ERR(parent_inode)) 479 - return PTR_ERR(parent_inode); 480 - 481 - child_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, root); 482 - if (IS_ERR(child_inode)) { 483 - iput(parent_inode); 484 - return PTR_ERR(child_inode); 485 - } 486 - 487 - ret = inherit_props(trans, child_inode, parent_inode); 488 - iput(child_inode); 489 - iput(parent_inode); 490 - 491 - return ret; 492 458 } 493 459 494 460 void __init btrfs_props_init(void)

-4

fs/btrfs/props.h

··· 23 23 struct inode *inode, 24 24 struct inode *dir); 25 25 26 - int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, 27 - struct btrfs_root *root, 28 - struct btrfs_root *parent_root); 29 - 30 26 #endif

+4 -3

fs/btrfs/qgroup.c

··· 2290 2290 return 0; 2291 2291 2292 2292 if (!extent_buffer_uptodate(root_eb)) { 2293 - ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL); 2293 + ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL); 2294 2294 if (ret) 2295 2295 goto out; 2296 2296 } ··· 3939 3939 } 3940 3940 3941 3941 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 3942 - enum btrfs_qgroup_rsv_type type, bool enforce) 3942 + enum btrfs_qgroup_rsv_type type, bool enforce, 3943 + bool noflush) 3943 3944 { 3944 3945 int ret; 3945 3946 3946 3947 ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 3947 - if (ret <= 0 && ret != -EDQUOT) 3948 + if ((ret <= 0 && ret != -EDQUOT) || noflush) 3948 3949 return ret; 3949 3950 3950 3951 ret = try_flush_qgroup(root);

+8 -4

fs/btrfs/qgroup.h

··· 364 364 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 365 365 enum btrfs_qgroup_rsv_type type, bool enforce); 366 366 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 367 - enum btrfs_qgroup_rsv_type type, bool enforce); 367 + enum btrfs_qgroup_rsv_type type, bool enforce, 368 + bool noflush); 368 369 /* Reserve metadata space for pertrans and prealloc type */ 369 370 static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, 370 371 int num_bytes, bool enforce) 371 372 { 372 373 return __btrfs_qgroup_reserve_meta(root, num_bytes, 373 - BTRFS_QGROUP_RSV_META_PERTRANS, enforce); 374 + BTRFS_QGROUP_RSV_META_PERTRANS, 375 + enforce, false); 374 376 } 375 377 static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, 376 - int num_bytes, bool enforce) 378 + int num_bytes, bool enforce, 379 + bool noflush) 377 380 { 378 381 return __btrfs_qgroup_reserve_meta(root, num_bytes, 379 - BTRFS_QGROUP_RSV_META_PREALLOC, enforce); 382 + BTRFS_QGROUP_RSV_META_PREALLOC, 383 + enforce, noflush); 380 384 } 381 385 382 386 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,

+468 -339

fs/btrfs/raid56.c

··· 52 52 struct btrfs_stripe_hash table[]; 53 53 }; 54 54 55 + /* 56 + * A bvec like structure to present a sector inside a page. 57 + * 58 + * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. 59 + */ 60 + struct sector_ptr { 61 + struct page *page; 62 + unsigned int pgoff:24; 63 + unsigned int uptodate:8; 64 + }; 65 + 55 66 enum btrfs_rbio_ops { 56 67 BTRFS_RBIO_WRITE, 57 68 BTRFS_RBIO_READ_REBUILD, ··· 88 77 /* 89 78 * for scheduling work in the helper threads 90 79 */ 91 - struct btrfs_work work; 80 + struct work_struct work; 92 81 93 82 /* 94 83 * bio list and bio_list_lock are used ··· 112 101 */ 113 102 unsigned long flags; 114 103 115 - /* size of each individual stripe on disk */ 116 - int stripe_len; 117 - 118 - /* number of data stripes (no p/q) */ 119 - int nr_data; 120 - 121 - int real_stripes; 122 - 123 - int stripe_npages; 124 104 /* 125 105 * set if we're doing a parity rebuild 126 106 * for a read from higher up, which is handled ··· 120 118 */ 121 119 enum btrfs_rbio_ops operation; 122 120 123 - /* first bad stripe */ 124 - int faila; 121 + /* Size of each individual stripe on disk */ 122 + u32 stripe_len; 125 123 126 - /* second bad stripe (for raid6 use) */ 127 - int failb; 124 + /* How many pages there are for the full stripe including P/Q */ 125 + u16 nr_pages; 128 126 129 - int scrubp; 130 - /* 131 - * number of pages needed to represent the full 132 - * stripe 133 - */ 134 - int nr_pages; 127 + /* How many sectors there are for the full stripe including P/Q */ 128 + u16 nr_sectors; 129 + 130 + /* Number of data stripes (no p/q) */ 131 + u8 nr_data; 132 + 133 + /* Number of all stripes (including P/Q) */ 134 + u8 real_stripes; 135 + 136 + /* How many pages there are for each stripe */ 137 + u8 stripe_npages; 138 + 139 + /* How many sectors there are for each stripe */ 140 + u8 stripe_nsectors; 141 + 142 + /* First bad stripe, -1 means no corruption */ 143 + s8 faila; 144 + 145 + /* Second bad stripe (for RAID6 use) */ 146 + s8
failb; 147 + 148 + /* Stripe number that we're scrubbing */ 149 + u8 scrubp; 135 150 136 151 /* 137 152 * size of all the bios in the bio_list. This ··· 175 156 */ 176 157 struct page **stripe_pages; 177 158 178 - /* 179 - * pointers to the pages in the bio_list. Stored 180 - * here for faster lookup 181 - */ 182 - struct page **bio_pages; 159 + /* Pointers to the sectors in the bio_list, for faster lookup */ 160 + struct sector_ptr *bio_sectors; 183 161 184 162 /* 185 - * bitmap to record which horizontal stripe has data 163 + * For subpage support, we need to map each sector to above 164 + * stripe_pages. 186 165 */ 166 + struct sector_ptr *stripe_sectors; 167 + 168 + /* Bitmap to record which horizontal stripe has data */ 187 169 unsigned long *dbitmap; 188 170 189 171 /* allocated with real_stripes-many pointers for finish_*() calls */ 190 172 void **finish_pointers; 191 173 192 - /* allocated with stripe_npages-many bits for finish_*() calls */ 174 + /* Allocated with stripe_nsectors-many bits for finish_*() calls */ 193 175 unsigned long *finish_pbitmap; 194 176 }; 195 177 196 178 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 197 179 static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 198 - static void rmw_work(struct btrfs_work *work); 199 - static void read_rebuild_work(struct btrfs_work *work); 180 + static void rmw_work(struct work_struct *work); 181 + static void read_rebuild_work(struct work_struct *work); 200 182 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 201 183 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 202 184 static void __free_raid_bio(struct btrfs_raid_bio *rbio); ··· 206 186 207 187 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 208 188 int need_check); 209 - static void scrub_parity_work(struct btrfs_work *work); 189 + static void scrub_parity_work(struct work_struct *work); 210 190 211 - static void start_async_work(struct 
btrfs_raid_bio *rbio, btrfs_func_t work_func) 191 + static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) 212 192 { 213 - btrfs_init_work(&rbio->work, work_func, NULL, NULL); 214 - btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); 193 + INIT_WORK(&rbio->work, work_func); 194 + queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); 215 195 } 216 196 217 197 /* ··· 259 239 260 240 /* 261 241 * caching an rbio means to copy anything from the 262 - * bio_pages array into the stripe_pages array. We 242 + * bio_sectors array into the stripe_pages array. We 263 243 * use the page uptodate bit in the stripe cache array 264 244 * to indicate if it has valid data 265 245 * ··· 275 255 if (ret) 276 256 return; 277 257 278 - for (i = 0; i < rbio->nr_pages; i++) { 279 - if (!rbio->bio_pages[i]) 258 + for (i = 0; i < rbio->nr_sectors; i++) { 259 + /* Some range not covered by bio (partial write), skip it */ 260 + if (!rbio->bio_sectors[i].page) 280 261 continue; 281 262 282 - copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]); 283 - SetPageUptodate(rbio->stripe_pages[i]); 263 + ASSERT(rbio->stripe_sectors[i].page); 264 + memcpy_page(rbio->stripe_sectors[i].page, 265 + rbio->stripe_sectors[i].pgoff, 266 + rbio->bio_sectors[i].page, 267 + rbio->bio_sectors[i].pgoff, 268 + rbio->bioc->fs_info->sectorsize); 269 + rbio->stripe_sectors[i].uptodate = 1; 284 270 } 285 271 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 286 272 } ··· 309 283 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 310 284 } 311 285 286 + static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, 287 + unsigned int page_nr) 288 + { 289 + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 290 + const u32 sectors_per_page = PAGE_SIZE / sectorsize; 291 + int i; 292 + 293 + ASSERT(page_nr < rbio->nr_pages); 294 + 295 + for (i = sectors_per_page * page_nr; 296 + i < sectors_per_page * page_nr + sectors_per_page; 297 + i++) { 298 + if 
(!rbio->stripe_sectors[i].uptodate) 299 + return false; 300 + } 301 + return true; 302 + } 303 + 312 304 /* 313 - * stealing an rbio means taking all the uptodate pages from the stripe 314 - * array in the source rbio and putting them into the destination rbio 305 + * Update the stripe_sectors[] array to use correct page and pgoff 306 + * 307 + * Should be called every time any page pointer in stripe_pages[] gets modified. 308 + */ 309 + static void index_stripe_sectors(struct btrfs_raid_bio *rbio) 310 + { 311 + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 312 + u32 offset; 313 + int i; 314 + 315 + for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { 316 + int page_index = offset >> PAGE_SHIFT; 317 + 318 + ASSERT(page_index < rbio->nr_pages); 319 + rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; 320 + rbio->stripe_sectors[i].pgoff = offset_in_page(offset); 321 + } 322 + } 323 + 324 + /* 325 + * Stealing an rbio means taking all the uptodate pages from the stripe array 326 + * in the source rbio and putting them into the destination rbio. 327 + * 328 + * This will also update the involved stripe_sectors[] which are referring to 329 + * the old pages.
315 330 */ 316 331 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 317 332 { ··· 365 298 366 299 for (i = 0; i < dest->nr_pages; i++) { 367 300 s = src->stripe_pages[i]; 368 - if (!s || !PageUptodate(s)) { 301 + if (!s || !full_page_sectors_uptodate(src, i)) 369 302 continue; 370 - } 371 303 372 304 d = dest->stripe_pages[i]; 373 305 if (d) ··· 375 309 dest->stripe_pages[i] = s; 376 310 src->stripe_pages[i] = NULL; 377 311 } 312 + index_stripe_sectors(dest); 313 + index_stripe_sectors(src); 378 314 } 379 315 380 316 /* ··· 668 600 return 1; 669 601 } 670 602 671 - static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, 672 - int index) 603 + static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, 604 + unsigned int stripe_nr, 605 + unsigned int sector_nr) 673 606 { 674 - return stripe * rbio->stripe_npages + index; 607 + ASSERT(stripe_nr < rbio->real_stripes); 608 + ASSERT(sector_nr < rbio->stripe_nsectors); 609 + 610 + return stripe_nr * rbio->stripe_nsectors + sector_nr; 675 611 } 676 612 677 - /* 678 - * these are just the pages from the rbio array, not from anything 679 - * the FS sent down to us 680 - */ 681 - static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, 682 - int index) 613 + /* Return a sector from rbio->stripe_sectors, not from the bio list */ 614 + static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, 615 + unsigned int stripe_nr, 616 + unsigned int sector_nr) 683 617 { 684 - return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; 618 + return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, 619 + sector_nr)]; 685 620 } 686 621 687 - /* 688 - * helper to index into the pstripe 689 - */ 690 - static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) 622 + /* Grab a sector inside P stripe */ 623 + static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, 
624 + unsigned int sector_nr) 691 625 { 692 - return rbio_stripe_page(rbio, rbio->nr_data, index); 626 + return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); 693 627 } 694 628 695 - /* 696 - * helper to index into the qstripe, returns null 697 - * if there is no qstripe 698 - */ 699 - static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 629 + /* Grab a sector inside Q stripe, return NULL if not RAID6 */ 630 + static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, 631 + unsigned int sector_nr) 700 632 { 701 633 if (rbio->nr_data + 1 == rbio->real_stripes) 702 634 return NULL; 703 - return rbio_stripe_page(rbio, rbio->nr_data + 1, index); 635 + return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); 704 636 } 705 637 706 638 /* ··· 979 911 rbio_orig_end_io(rbio, err); 980 912 } 981 913 982 - /* 983 - * the read/modify/write code wants to use the original bio for 984 - * any pages it included, and then use the rbio for everything 985 - * else. This function decides if a given index (stripe number) 986 - * and page number in that stripe fall inside the original bio 987 - * or the rbio. 914 + /** 915 + * Get a sector pointer specified by its @stripe_nr and @sector_nr 988 916 * 989 - * if you set bio_list_only, you'll get a NULL back for any ranges 990 - * that are outside the bio_list 917 + * @rbio: The raid bio 918 + * @stripe_nr: Stripe number, valid range [0, real_stripes) 919 + * @sector_nr: Sector number inside the stripe, 920 + * valid range [0, stripe_nsectors) 921 + * @bio_list_only: Whether to use sectors inside the bio list only. 991 922 * 992 - * This doesn't take any refs on anything, you get a bare page pointer 993 - * and the caller must bump refs as required. 994 - * 995 - * You must call index_rbio_pages once before you can trust 996 - * the answers from this function.
923 + * The read/modify/write code wants to reuse the original bio page as much 924 + * as possible, and only use stripe_sectors as fallback. 997 925 */ 998 - static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, 999 - int index, int pagenr, int bio_list_only) 926 + static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, 927 + int stripe_nr, int sector_nr, 928 + bool bio_list_only) 1000 929 { 1001 - int chunk_page; 1002 - struct page *p = NULL; 930 + struct sector_ptr *sector; 931 + int index; 1003 932 1004 - chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; 933 + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); 934 + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); 935 + 936 + index = stripe_nr * rbio->stripe_nsectors + sector_nr; 937 + ASSERT(index >= 0 && index < rbio->nr_sectors); 1005 938 1006 939 spin_lock_irq(&rbio->bio_list_lock); 1007 - p = rbio->bio_pages[chunk_page]; 940 + sector = &rbio->bio_sectors[index]; 941 + if (sector->page || bio_list_only) { 942 + /* Don't return sector without a valid page pointer */ 943 + if (!sector->page) 944 + sector = NULL; 945 + spin_unlock_irq(&rbio->bio_list_lock); 946 + return sector; 947 + } 1008 948 spin_unlock_irq(&rbio->bio_list_lock); 1009 949 1010 - if (p || bio_list_only) 1011 - return p; 1012 - 1013 - return rbio->stripe_pages[chunk_page]; 1014 - } 1015 - 1016 - /* 1017 - * number of pages we need for the entire stripe across all the 1018 - * drives 1019 - */ 1020 - static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 1021 - { 1022 - return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes; 950 + return &rbio->stripe_sectors[index]; 1023 951 } 1024 952 1025 953 /* ··· 1024 960 */ 1025 961 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, 1026 962 struct btrfs_io_context *bioc, 1027 - u64 stripe_len) 963 + u32 stripe_len) 1028 964 { 965 + const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; 
966 + const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT; 967 + const unsigned int num_pages = stripe_npages * real_stripes; 968 + const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits; 969 + const unsigned int num_sectors = stripe_nsectors * real_stripes; 1029 970 struct btrfs_raid_bio *rbio; 1030 971 int nr_data = 0; 1031 - int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; 1032 - int num_pages = rbio_nr_pages(stripe_len, real_stripes); 1033 - int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); 1034 972 void *p; 973 + 974 + ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE)); 975 + /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ 976 + ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); 1035 977 1036 978 rbio = kzalloc(sizeof(*rbio) + 1037 979 sizeof(*rbio->stripe_pages) * num_pages + 1038 - sizeof(*rbio->bio_pages) * num_pages + 980 + sizeof(*rbio->bio_sectors) * num_sectors + 981 + sizeof(*rbio->stripe_sectors) * num_sectors + 1039 982 sizeof(*rbio->finish_pointers) * real_stripes + 1040 - sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) + 1041 - sizeof(*rbio->finish_pbitmap) * 1042 - BITS_TO_LONGS(stripe_npages), 983 + sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) + 984 + sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors), 1043 985 GFP_NOFS); 1044 986 if (!rbio) 1045 987 return ERR_PTR(-ENOMEM); ··· 1058 988 rbio->bioc = bioc; 1059 989 rbio->stripe_len = stripe_len; 1060 990 rbio->nr_pages = num_pages; 991 + rbio->nr_sectors = num_sectors; 1061 992 rbio->real_stripes = real_stripes; 1062 993 rbio->stripe_npages = stripe_npages; 994 + rbio->stripe_nsectors = stripe_nsectors; 1063 995 rbio->faila = -1; 1064 996 rbio->failb = -1; 1065 997 refcount_set(&rbio->refs, 1); ··· 1069 997 atomic_set(&rbio->stripes_pending, 0); 1070 998 1071 999 /* 1072 - * the stripe_pages, bio_pages, etc arrays point to the extra 1073 - * memory we allocated past the end of the rbio 1000 + * The 
stripe_pages, bio_sectors, etc arrays point to the extra memory 1001 + * we allocated past the end of the rbio. 1074 1002 */ 1075 1003 p = rbio + 1; 1076 1004 #define CONSUME_ALLOC(ptr, count) do { \ ··· 1078 1006 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ 1079 1007 } while (0) 1080 1008 CONSUME_ALLOC(rbio->stripe_pages, num_pages); 1081 - CONSUME_ALLOC(rbio->bio_pages, num_pages); 1009 + CONSUME_ALLOC(rbio->bio_sectors, num_sectors); 1010 + CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); 1082 1011 CONSUME_ALLOC(rbio->finish_pointers, real_stripes); 1083 - CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages)); 1084 - CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); 1012 + CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors)); 1013 + CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors)); 1085 1014 #undef CONSUME_ALLOC 1086 1015 1087 1016 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) ··· 1099 1026 /* allocate pages for all the stripes in the bio, including parity */ 1100 1027 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 1101 1028 { 1102 - int i; 1103 - struct page *page; 1029 + int ret; 1104 1030 1105 - for (i = 0; i < rbio->nr_pages; i++) { 1106 - if (rbio->stripe_pages[i]) 1107 - continue; 1108 - page = alloc_page(GFP_NOFS); 1109 - if (!page) 1110 - return -ENOMEM; 1111 - rbio->stripe_pages[i] = page; 1112 - } 1031 + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); 1032 + if (ret < 0) 1033 + return ret; 1034 + /* Mapping all sectors */ 1035 + index_stripe_sectors(rbio); 1113 1036 return 0; 1114 1037 } 1115 1038 1116 1039 /* only allocate pages for p/q stripes */ 1117 1040 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 1118 1041 { 1119 - int i; 1120 - struct page *page; 1042 + const int data_pages = rbio->nr_data * rbio->stripe_npages; 1043 + int ret; 1121 1044 1122 - i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); 1045 + ret = 
btrfs_alloc_page_array(rbio->nr_pages - data_pages, 1046 + rbio->stripe_pages + data_pages); 1047 + if (ret < 0) 1048 + return ret; 1123 1049 1124 - for (; i < rbio->nr_pages; i++) { 1125 - if (rbio->stripe_pages[i]) 1126 - continue; 1127 - page = alloc_page(GFP_NOFS); 1128 - if (!page) 1129 - return -ENOMEM; 1130 - rbio->stripe_pages[i] = page; 1131 - } 1050 + index_stripe_sectors(rbio); 1132 1051 return 0; 1133 1052 } 1134 1053 1135 1054 /* 1136 - * add a single page from a specific stripe into our list of bios for IO 1137 - * this will try to merge into existing bios if possible, and returns 1138 - * zero if all went well. 1055 + * Add a single sector @sector into our list of bios for IO. 1056 + * 1057 + * Return 0 if everything went well. 1058 + * Return <0 for error. 1139 1059 */ 1140 - static int rbio_add_io_page(struct btrfs_raid_bio *rbio, 1141 - struct bio_list *bio_list, 1142 - struct page *page, 1143 - int stripe_nr, 1144 - unsigned long page_index, 1145 - unsigned long bio_max_len) 1060 + static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, 1061 + struct bio_list *bio_list, 1062 + struct sector_ptr *sector, 1063 + unsigned int stripe_nr, 1064 + unsigned int sector_nr, 1065 + unsigned long bio_max_len, 1066 + unsigned int opf) 1146 1067 { 1068 + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1147 1069 struct bio *last = bio_list->tail; 1148 1070 int ret; 1149 1071 struct bio *bio; 1150 1072 struct btrfs_io_stripe *stripe; 1151 1073 u64 disk_start; 1152 1074 1075 + /* 1076 + * Note: here stripe_nr has taken device replace into consideration, 1077 + * thus it can be larger than rbio->real_stripes. 1078 + * So here we check against bioc->num_stripes, not rbio->real_stripes.
1079 + */ 1080 + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); 1081 + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); 1082 + ASSERT(sector->page); 1083 + 1153 1084 stripe = &rbio->bioc->stripes[stripe_nr]; 1154 - disk_start = stripe->physical + (page_index << PAGE_SHIFT); 1085 + disk_start = stripe->physical + sector_nr * sectorsize; 1155 1086 1156 1087 /* if the device is missing, just fail this stripe */ 1157 1088 if (!stripe->dev->bdev) ··· 1172 1095 */ 1173 1096 if (last_end == disk_start && !last->bi_status && 1174 1097 last->bi_bdev == stripe->dev->bdev) { 1175 - ret = bio_add_page(last, page, PAGE_SIZE, 0); 1176 - if (ret == PAGE_SIZE) 1098 + ret = bio_add_page(last, sector->page, sectorsize, 1099 + sector->pgoff); 1100 + if (ret == sectorsize) 1177 1101 return 0; 1178 1102 } 1179 1103 } 1180 1104 1181 1105 /* put a new bio on the list */ 1182 - bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); 1183 - btrfs_bio(bio)->device = stripe->dev; 1184 - bio->bi_iter.bi_size = 0; 1185 - bio_set_dev(bio, stripe->dev->bdev); 1106 + bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL), 1107 + opf, GFP_NOFS); 1186 1108 bio->bi_iter.bi_sector = disk_start >> 9; 1109 + bio->bi_private = rbio; 1187 1110 1188 - bio_add_page(bio, page, PAGE_SIZE, 0); 1111 + bio_add_page(bio, sector->page, sectorsize, sector->pgoff); 1189 1112 bio_list_add(bio_list, bio); 1190 1113 return 0; 1191 1114 } ··· 1207 1130 } 1208 1131 } 1209 1132 1133 + static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) 1134 + { 1135 + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1136 + struct bio_vec bvec; 1137 + struct bvec_iter iter; 1138 + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 1139 + rbio->bioc->raid_map[0]; 1140 + 1141 + if (bio_flagged(bio, BIO_CLONED)) 1142 + bio->bi_iter = btrfs_bio(bio)->iter; 1143 + 1144 + bio_for_each_segment(bvec, bio, iter) { 1145 + u32 bvec_offset; 1146 + 1147 + for (bvec_offset = 0; 
bvec_offset < bvec.bv_len; 1148 + bvec_offset += sectorsize, offset += sectorsize) { 1149 + int index = offset / sectorsize; 1150 + struct sector_ptr *sector = &rbio->bio_sectors[index]; 1151 + 1152 + sector->page = bvec.bv_page; 1153 + sector->pgoff = bvec.bv_offset + bvec_offset; 1154 + ASSERT(sector->pgoff < PAGE_SIZE); 1155 + } 1156 + } 1157 + } 1158 + 1210 1159 /* 1211 1160 * helper function to walk our bio list and populate the bio_pages array with 1212 1161 * the result. This seems expensive, but it is faster than constantly ··· 1244 1141 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1245 1142 { 1246 1143 struct bio *bio; 1247 - u64 start; 1248 - unsigned long stripe_offset; 1249 - unsigned long page_index; 1250 1144 1251 1145 spin_lock_irq(&rbio->bio_list_lock); 1252 - bio_list_for_each(bio, &rbio->bio_list) { 1253 - struct bio_vec bvec; 1254 - struct bvec_iter iter; 1255 - int i = 0; 1146 + bio_list_for_each(bio, &rbio->bio_list) 1147 + index_one_bio(rbio, bio); 1256 1148 1257 - start = bio->bi_iter.bi_sector << 9; 1258 - stripe_offset = start - rbio->bioc->raid_map[0]; 1259 - page_index = stripe_offset >> PAGE_SHIFT; 1260 - 1261 - if (bio_flagged(bio, BIO_CLONED)) 1262 - bio->bi_iter = btrfs_bio(bio)->iter; 1263 - 1264 - bio_for_each_segment(bvec, bio, iter) { 1265 - rbio->bio_pages[page_index + i] = bvec.bv_page; 1266 - i++; 1267 - } 1268 - } 1269 1149 spin_unlock_irq(&rbio->bio_list_lock); 1270 1150 } 1271 1151 ··· 1263 1177 static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1264 1178 { 1265 1179 struct btrfs_io_context *bioc = rbio->bioc; 1180 + const u32 sectorsize = bioc->fs_info->sectorsize; 1266 1181 void **pointers = rbio->finish_pointers; 1267 1182 int nr_data = rbio->nr_data; 1268 1183 int stripe; 1269 - int pagenr; 1184 + int sectornr; 1270 1185 bool has_qstripe; 1271 1186 struct bio_list bio_list; 1272 1187 struct bio *bio; ··· 1311 1224 else 1312 1225 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1313 1226 1314 - for 
(pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1315 - struct page *p; 1316 - /* first collect one page from each data stripe */ 1227 + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1228 + struct sector_ptr *sector; 1229 + 1230 + /* First collect one sector from each data stripe */ 1317 1231 for (stripe = 0; stripe < nr_data; stripe++) { 1318 - p = page_in_rbio(rbio, stripe, pagenr, 0); 1319 - pointers[stripe] = kmap_local_page(p); 1232 + sector = sector_in_rbio(rbio, stripe, sectornr, 0); 1233 + pointers[stripe] = kmap_local_page(sector->page) + 1234 + sector->pgoff; 1320 1235 } 1321 1236 1322 - /* then add the parity stripe */ 1323 - p = rbio_pstripe_page(rbio, pagenr); 1324 - SetPageUptodate(p); 1325 - pointers[stripe++] = kmap_local_page(p); 1237 + /* Then add the parity stripe */ 1238 + sector = rbio_pstripe_sector(rbio, sectornr); 1239 + sector->uptodate = 1; 1240 + pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; 1326 1241 1327 1242 if (has_qstripe) { 1328 - 1329 1243 /* 1330 - * raid6, add the qstripe and call the 1331 - * library function to fill in our p/q 1244 + * RAID6, add the qstripe and call the library function 1245 + * to fill in our p/q 1332 1246 */ 1333 - p = rbio_qstripe_page(rbio, pagenr); 1334 - SetPageUptodate(p); 1335 - pointers[stripe++] = kmap_local_page(p); 1247 + sector = rbio_qstripe_sector(rbio, sectornr); 1248 + sector->uptodate = 1; 1249 + pointers[stripe++] = kmap_local_page(sector->page) + 1250 + sector->pgoff; 1336 1251 1337 - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 1252 + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 1338 1253 pointers); 1339 1254 } else { 1340 1255 /* raid5 */ 1341 - copy_page(pointers[nr_data], pointers[0]); 1342 - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); 1256 + memcpy(pointers[nr_data], pointers[0], sectorsize); 1257 + run_xor(pointers + 1, nr_data - 1, sectorsize); 1343 1258 } 1344 1259 for (stripe = stripe - 1; stripe >= 0; 
stripe--) 1345 1260 kunmap_local(pointers[stripe]); ··· 1353 1264 * everything else. 1354 1265 */ 1355 1266 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1356 - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1357 - struct page *page; 1267 + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1268 + struct sector_ptr *sector; 1269 + 1358 1270 if (stripe < rbio->nr_data) { 1359 - page = page_in_rbio(rbio, stripe, pagenr, 1); 1360 - if (!page) 1271 + sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1272 + if (!sector) 1361 1273 continue; 1362 1274 } else { 1363 - page = rbio_stripe_page(rbio, stripe, pagenr); 1275 + sector = rbio_stripe_sector(rbio, stripe, sectornr); 1364 1276 } 1365 1277 1366 - ret = rbio_add_io_page(rbio, &bio_list, 1367 - page, stripe, pagenr, rbio->stripe_len); 1278 + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, 1279 + sectornr, rbio->stripe_len, 1280 + REQ_OP_WRITE); 1368 1281 if (ret) 1369 1282 goto cleanup; 1370 1283 } ··· 1379 1288 if (!bioc->tgtdev_map[stripe]) 1380 1289 continue; 1381 1290 1382 - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1383 - struct page *page; 1291 + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1292 + struct sector_ptr *sector; 1293 + 1384 1294 if (stripe < rbio->nr_data) { 1385 - page = page_in_rbio(rbio, stripe, pagenr, 1); 1386 - if (!page) 1295 + sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1296 + if (!sector) 1387 1297 continue; 1388 1298 } else { 1389 - page = rbio_stripe_page(rbio, stripe, pagenr); 1299 + sector = rbio_stripe_sector(rbio, stripe, sectornr); 1390 1300 } 1391 1301 1392 - ret = rbio_add_io_page(rbio, &bio_list, page, 1302 + ret = rbio_add_io_sector(rbio, &bio_list, sector, 1393 1303 rbio->bioc->tgtdev_map[stripe], 1394 - pagenr, rbio->stripe_len); 1304 + sectornr, rbio->stripe_len, 1305 + REQ_OP_WRITE); 1395 1306 if (ret) 1396 1307 goto cleanup; 1397 1308 } ··· 1404 1311 
BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 1405 1312 1406 1313 while ((bio = bio_list_pop(&bio_list))) { 1407 - bio->bi_private = rbio; 1408 1314 bio->bi_end_io = raid_write_end_io; 1409 - bio->bi_opf = REQ_OP_WRITE; 1410 1315 1411 1316 submit_bio(bio); 1412 1317 } ··· 1508 1417 } 1509 1418 1510 1419 /* 1420 + * For subpage case, we can no longer set page Uptodate directly for 1421 + * stripe_pages[], thus we need to locate the sector. 1422 + */ 1423 + static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, 1424 + struct page *page, 1425 + unsigned int pgoff) 1426 + { 1427 + int i; 1428 + 1429 + for (i = 0; i < rbio->nr_sectors; i++) { 1430 + struct sector_ptr *sector = &rbio->stripe_sectors[i]; 1431 + 1432 + if (sector->page == page && sector->pgoff == pgoff) 1433 + return sector; 1434 + } 1435 + return NULL; 1436 + } 1437 + 1438 + /* 1511 1439 * this sets each page in the bio uptodate. It should only be used on private 1512 1440 * rbio pages, nothing that comes in from the higher layers 1513 1441 */ 1514 - static void set_bio_pages_uptodate(struct bio *bio) 1442 + static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) 1515 1443 { 1444 + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1516 1445 struct bio_vec *bvec; 1517 1446 struct bvec_iter_all iter_all; 1518 1447 1519 1448 ASSERT(!bio_flagged(bio, BIO_CLONED)); 1520 1449 1521 - bio_for_each_segment_all(bvec, bio, iter_all) 1522 - SetPageUptodate(bvec->bv_page); 1450 + bio_for_each_segment_all(bvec, bio, iter_all) { 1451 + struct sector_ptr *sector; 1452 + int pgoff; 1453 + 1454 + for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; 1455 + pgoff += sectorsize) { 1456 + sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); 1457 + ASSERT(sector); 1458 + if (sector) 1459 + sector->uptodate = 1; 1460 + } 1461 + } 1523 1462 } 1524 1463 1525 1464 /* ··· 1567 1446 if (bio->bi_status) 1568 1447 fail_bio_stripe(rbio, bio); 1569 1448 else 1570 
- set_bio_pages_uptodate(bio); 1449 + set_bio_pages_uptodate(rbio, bio); 1571 1450 1572 1451 bio_put(bio); 1573 1452 ··· 1599 1478 int bios_to_read = 0; 1600 1479 struct bio_list bio_list; 1601 1480 int ret; 1602 - int pagenr; 1481 + int sectornr; 1603 1482 int stripe; 1604 1483 struct bio *bio; 1605 1484 ··· 1617 1496 * stripe 1618 1497 */ 1619 1498 for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1620 - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1621 - struct page *page; 1499 + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1500 + struct sector_ptr *sector; 1501 + 1622 1502 /* 1623 - * we want to find all the pages missing from 1624 - * the rbio and read them from the disk. If 1625 - * page_in_rbio finds a page in the bio list 1626 - * we don't need to read it off the stripe. 1503 + * We want to find all the sectors missing from the 1504 + * rbio and read them from the disk. If * sector_in_rbio() 1505 + * finds a page in the bio list we don't need to read 1506 + * it off the stripe. 1627 1507 */ 1628 - page = page_in_rbio(rbio, stripe, pagenr, 1); 1629 - if (page) 1508 + sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1509 + if (sector) 1630 1510 continue; 1631 1511 1632 - page = rbio_stripe_page(rbio, stripe, pagenr); 1512 + sector = rbio_stripe_sector(rbio, stripe, sectornr); 1633 1513 /* 1634 - * the bio cache may have handed us an uptodate 1635 - * page. If so, be happy and use it 1514 + * The bio cache may have handed us an uptodate page. 1515 + * If so, be happy and use it. 
1636 1516 */ 1637 - if (PageUptodate(page)) 1517 + if (sector->uptodate) 1638 1518 continue; 1639 1519 1640 - ret = rbio_add_io_page(rbio, &bio_list, page, 1641 - stripe, pagenr, rbio->stripe_len); 1520 + ret = rbio_add_io_sector(rbio, &bio_list, sector, 1521 + stripe, sectornr, rbio->stripe_len, 1522 + REQ_OP_READ); 1642 1523 if (ret) 1643 1524 goto cleanup; 1644 1525 } ··· 1663 1540 */ 1664 1541 atomic_set(&rbio->stripes_pending, bios_to_read); 1665 1542 while ((bio = bio_list_pop(&bio_list))) { 1666 - bio->bi_private = rbio; 1667 1543 bio->bi_end_io = raid_rmw_end_io; 1668 - bio->bi_opf = REQ_OP_READ; 1669 1544 1670 1545 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 1671 1546 ··· 1745 1624 struct blk_plug_cb cb; 1746 1625 struct btrfs_fs_info *info; 1747 1626 struct list_head rbio_list; 1748 - struct btrfs_work work; 1627 + struct work_struct work; 1749 1628 }; 1750 1629 1751 1630 /* ··· 1813 1692 * if the unplug comes from schedule, we have to push the 1814 1693 * work off to a helper thread 1815 1694 */ 1816 - static void unplug_work(struct btrfs_work *work) 1695 + static void unplug_work(struct work_struct *work) 1817 1696 { 1818 1697 struct btrfs_plug_cb *plug; 1819 1698 plug = container_of(work, struct btrfs_plug_cb, work); ··· 1826 1705 plug = container_of(cb, struct btrfs_plug_cb, cb); 1827 1706 1828 1707 if (from_schedule) { 1829 - btrfs_init_work(&plug->work, unplug_work, NULL, NULL); 1830 - btrfs_queue_work(plug->info->rmw_workers, 1831 - &plug->work); 1708 + INIT_WORK(&plug->work, unplug_work); 1709 + queue_work(plug->info->rmw_workers, &plug->work); 1832 1710 return; 1833 1711 } 1834 1712 run_plug(plug); ··· 1836 1716 /* 1837 1717 * our main entry point for writes from the rest of the FS. 
1838 1718 */ 1839 - int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, 1840 - u64 stripe_len) 1719 + int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len) 1841 1720 { 1842 1721 struct btrfs_fs_info *fs_info = bioc->fs_info; 1843 1722 struct btrfs_raid_bio *rbio; ··· 1891 1772 */ 1892 1773 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 1893 1774 { 1894 - int pagenr, stripe; 1775 + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1776 + int sectornr, stripe; 1895 1777 void **pointers; 1896 1778 void **unmap_array; 1897 1779 int faila = -1, failb = -1; 1898 - struct page *page; 1899 1780 blk_status_t err; 1900 1781 int i; 1901 1782 1783 + /* 1784 + * This array stores the pointer for each sector, thus it has the extra 1785 + * pgoff value added from each sector 1786 + */ 1902 1787 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1903 1788 if (!pointers) { 1904 1789 err = BLK_STS_RESOURCE; ··· 1931 1808 1932 1809 index_rbio_pages(rbio); 1933 1810 1934 - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1811 + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1812 + struct sector_ptr *sector; 1813 + 1935 1814 /* 1936 1815 * Now we just use bitmap to mark the horizontal stripes in 1937 1816 * which we have data when doing parity scrub. 
1938 1817 */ 1939 1818 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1940 - !test_bit(pagenr, rbio->dbitmap)) 1819 + !test_bit(sectornr, rbio->dbitmap)) 1941 1820 continue; 1942 1821 1943 1822 /* 1944 - * Setup our array of pointers with pages from each stripe 1823 + * Setup our array of pointers with sectors from each stripe 1945 1824 * 1946 1825 * NOTE: store a duplicate array of pointers to preserve the 1947 1826 * pointer order 1948 1827 */ 1949 1828 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1950 1829 /* 1951 - * if we're rebuilding a read, we have to use 1830 + * If we're rebuilding a read, we have to use 1952 1831 * pages from the bio list 1953 1832 */ 1954 1833 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1955 1834 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1956 1835 (stripe == faila || stripe == failb)) { 1957 - page = page_in_rbio(rbio, stripe, pagenr, 0); 1836 + sector = sector_in_rbio(rbio, stripe, sectornr, 0); 1958 1837 } else { 1959 - page = rbio_stripe_page(rbio, stripe, pagenr); 1838 + sector = rbio_stripe_sector(rbio, stripe, sectornr); 1960 1839 } 1961 - pointers[stripe] = kmap_local_page(page); 1840 + ASSERT(sector->page); 1841 + pointers[stripe] = kmap_local_page(sector->page) + 1842 + sector->pgoff; 1962 1843 unmap_array[stripe] = pointers[stripe]; 1963 1844 } 1964 1845 1965 - /* all raid6 handling here */ 1846 + /* All raid6 handling here */ 1966 1847 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1967 - /* 1968 - * single failure, rebuild from parity raid5 1969 - * style 1970 - */ 1848 + /* Single failure, rebuild from parity raid5 style */ 1971 1849 if (failb < 0) { 1972 1850 if (faila == rbio->nr_data) { 1973 1851 /* ··· 2011 1887 2012 1888 if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { 2013 1889 raid6_datap_recov(rbio->real_stripes, 2014 - PAGE_SIZE, faila, pointers); 1890 + sectorsize, faila, pointers); 2015 1891 } else { 2016 1892 raid6_2data_recov(rbio->real_stripes, 2017 - PAGE_SIZE, faila, 
failb, 1893 + sectorsize, faila, failb, 2018 1894 pointers); 2019 1895 } 2020 1896 } else { ··· 2024 1900 BUG_ON(failb != -1); 2025 1901 pstripe: 2026 1902 /* Copy parity block into failed block to start with */ 2027 - copy_page(pointers[faila], pointers[rbio->nr_data]); 1903 + memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); 2028 1904 2029 1905 /* rearrange the pointer array */ 2030 1906 p = pointers[faila]; ··· 2033 1909 pointers[rbio->nr_data - 1] = p; 2034 1910 2035 1911 /* xor in the rest */ 2036 - run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); 1912 + run_xor(pointers, rbio->nr_data - 1, sectorsize); 2037 1913 } 2038 1914 /* if we're doing this rebuild as part of an rmw, go through 2039 1915 * and set all of our private rbio pages in the ··· 2042 1918 * other endio functions will fiddle the uptodate bits 2043 1919 */ 2044 1920 if (rbio->operation == BTRFS_RBIO_WRITE) { 2045 - for (i = 0; i < rbio->stripe_npages; i++) { 1921 + for (i = 0; i < rbio->stripe_nsectors; i++) { 2046 1922 if (faila != -1) { 2047 - page = rbio_stripe_page(rbio, faila, i); 2048 - SetPageUptodate(page); 1923 + sector = rbio_stripe_sector(rbio, faila, i); 1924 + sector->uptodate = 1; 2049 1925 } 2050 1926 if (failb != -1) { 2051 - page = rbio_stripe_page(rbio, failb, i); 2052 - SetPageUptodate(page); 1927 + sector = rbio_stripe_sector(rbio, failb, i); 1928 + sector->uptodate = 1; 2053 1929 } 2054 1930 } 2055 1931 } ··· 2122 1998 if (bio->bi_status) 2123 1999 fail_bio_stripe(rbio, bio); 2124 2000 else 2125 - set_bio_pages_uptodate(bio); 2001 + set_bio_pages_uptodate(rbio, bio); 2126 2002 bio_put(bio); 2127 2003 2128 2004 if (!atomic_dec_and_test(&rbio->stripes_pending)) ··· 2147 2023 int bios_to_read = 0; 2148 2024 struct bio_list bio_list; 2149 2025 int ret; 2150 - int pagenr; 2026 + int sectornr; 2151 2027 int stripe; 2152 2028 struct bio *bio; 2153 2029 ··· 2170 2046 continue; 2171 2047 } 2172 2048 2173 - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 2174 - 
struct page *p; 2049 + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 2050 + struct sector_ptr *sector; 2175 2051 2176 2052 /* 2177 2053 * the rmw code may have already read this 2178 2054 * page in 2179 2055 */ 2180 - p = rbio_stripe_page(rbio, stripe, pagenr); 2181 - if (PageUptodate(p)) 2056 + sector = rbio_stripe_sector(rbio, stripe, sectornr); 2057 + if (sector->uptodate) 2182 2058 continue; 2183 2059 2184 - ret = rbio_add_io_page(rbio, &bio_list, 2185 - rbio_stripe_page(rbio, stripe, pagenr), 2186 - stripe, pagenr, rbio->stripe_len); 2060 + ret = rbio_add_io_sector(rbio, &bio_list, sector, 2061 + stripe, sectornr, rbio->stripe_len, 2062 + REQ_OP_READ); 2187 2063 if (ret < 0) 2188 2064 goto cleanup; 2189 2065 } ··· 2210 2086 */ 2211 2087 atomic_set(&rbio->stripes_pending, bios_to_read); 2212 2088 while ((bio = bio_list_pop(&bio_list))) { 2213 - bio->bi_private = rbio; 2214 2089 bio->bi_end_io = raid_recover_end_io; 2215 - bio->bi_opf = REQ_OP_READ; 2216 2090 2217 2091 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 2218 2092 ··· 2237 2115 * of the drive. 
2238 2116 */ 2239 2117 int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, 2240 - u64 stripe_len, int mirror_num, int generic_io) 2118 + u32 stripe_len, int mirror_num, int generic_io) 2241 2119 { 2242 2120 struct btrfs_fs_info *fs_info = bioc->fs_info; 2243 2121 struct btrfs_raid_bio *rbio; ··· 2315 2193 2316 2194 } 2317 2195 2318 - static void rmw_work(struct btrfs_work *work) 2196 + static void rmw_work(struct work_struct *work) 2319 2197 { 2320 2198 struct btrfs_raid_bio *rbio; 2321 2199 ··· 2323 2201 raid56_rmw_stripe(rbio); 2324 2202 } 2325 2203 2326 - static void read_rebuild_work(struct btrfs_work *work) 2204 + static void read_rebuild_work(struct work_struct *work) 2327 2205 { 2328 2206 struct btrfs_raid_bio *rbio; 2329 2207 ··· 2343 2221 2344 2222 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, 2345 2223 struct btrfs_io_context *bioc, 2346 - u64 stripe_len, struct btrfs_device *scrub_dev, 2224 + u32 stripe_len, struct btrfs_device *scrub_dev, 2347 2225 unsigned long *dbitmap, int stripe_nsectors) 2348 2226 { 2349 2227 struct btrfs_fs_info *fs_info = bioc->fs_info; ··· 2374 2252 } 2375 2253 ASSERT(i < rbio->real_stripes); 2376 2254 2377 - /* Now we just support the sectorsize equals to page size */ 2378 - ASSERT(fs_info->sectorsize == PAGE_SIZE); 2379 - ASSERT(rbio->stripe_npages == stripe_nsectors); 2380 2255 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); 2381 2256 2382 2257 /* ··· 2387 2268 2388 2269 /* Used for both parity scrub and missing. 
*/ 2389 2270 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, 2390 - u64 logical) 2271 + unsigned int pgoff, u64 logical) 2391 2272 { 2273 + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 2392 2274 int stripe_offset; 2393 2275 int index; 2394 2276 2395 2277 ASSERT(logical >= rbio->bioc->raid_map[0]); 2396 - ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] + 2278 + ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + 2397 2279 rbio->stripe_len * rbio->nr_data); 2398 2280 stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); 2399 - index = stripe_offset >> PAGE_SHIFT; 2400 - rbio->bio_pages[index] = page; 2281 + index = stripe_offset / sectorsize; 2282 + rbio->bio_sectors[index].page = page; 2283 + rbio->bio_sectors[index].pgoff = pgoff; 2401 2284 } 2402 2285 2403 2286 /* ··· 2408 2287 */ 2409 2288 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) 2410 2289 { 2411 - int i; 2412 - int bit; 2413 - int index; 2414 - struct page *page; 2290 + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 2291 + int stripe; 2292 + int sectornr; 2415 2293 2416 - for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { 2417 - for (i = 0; i < rbio->real_stripes; i++) { 2418 - index = i * rbio->stripe_npages + bit; 2294 + for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { 2295 + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2296 + struct page *page; 2297 + int index = (stripe * rbio->stripe_nsectors + sectornr) * 2298 + sectorsize >> PAGE_SHIFT; 2299 + 2419 2300 if (rbio->stripe_pages[index]) 2420 2301 continue; 2421 2302 ··· 2427 2304 rbio->stripe_pages[index] = page; 2428 2305 } 2429 2306 } 2307 + index_stripe_sectors(rbio); 2430 2308 return 0; 2431 2309 } 2432 2310 ··· 2435 2311 int need_check) 2436 2312 { 2437 2313 struct btrfs_io_context *bioc = rbio->bioc; 2314 + const u32 sectorsize = bioc->fs_info->sectorsize; 2438 2315 void **pointers = rbio->finish_pointers; 2439 2316 unsigned 
long *pbitmap = rbio->finish_pbitmap; 2440 2317 int nr_data = rbio->nr_data; 2441 2318 int stripe; 2442 - int pagenr; 2319 + int sectornr; 2443 2320 bool has_qstripe; 2444 - struct page *p_page = NULL; 2445 - struct page *q_page = NULL; 2321 + struct sector_ptr p_sector = { 0 }; 2322 + struct sector_ptr q_sector = { 0 }; 2446 2323 struct bio_list bio_list; 2447 2324 struct bio *bio; 2448 2325 int is_replace = 0; ··· 2460 2335 2461 2336 if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { 2462 2337 is_replace = 1; 2463 - bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); 2338 + bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors); 2464 2339 } 2465 2340 2466 2341 /* ··· 2473 2348 if (!need_check) 2474 2349 goto writeback; 2475 2350 2476 - p_page = alloc_page(GFP_NOFS); 2477 - if (!p_page) 2351 + p_sector.page = alloc_page(GFP_NOFS); 2352 + if (!p_sector.page) 2478 2353 goto cleanup; 2479 - SetPageUptodate(p_page); 2354 + p_sector.pgoff = 0; 2355 + p_sector.uptodate = 1; 2480 2356 2481 2357 if (has_qstripe) { 2482 2358 /* RAID6, allocate and map temp space for the Q stripe */ 2483 - q_page = alloc_page(GFP_NOFS); 2484 - if (!q_page) { 2485 - __free_page(p_page); 2359 + q_sector.page = alloc_page(GFP_NOFS); 2360 + if (!q_sector.page) { 2361 + __free_page(p_sector.page); 2362 + p_sector.page = NULL; 2486 2363 goto cleanup; 2487 2364 } 2488 - SetPageUptodate(q_page); 2489 - pointers[rbio->real_stripes - 1] = kmap_local_page(q_page); 2365 + q_sector.pgoff = 0; 2366 + q_sector.uptodate = 1; 2367 + pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); 2490 2368 } 2491 2369 2492 2370 atomic_set(&rbio->error, 0); 2493 2371 2494 2372 /* Map the parity stripe just once */ 2495 - pointers[nr_data] = kmap_local_page(p_page); 2373 + pointers[nr_data] = kmap_local_page(p_sector.page); 2496 2374 2497 - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2498 - struct page *p; 2375 + for_each_set_bit(sectornr, rbio->dbitmap, 
rbio->stripe_nsectors) { 2376 + struct sector_ptr *sector; 2499 2377 void *parity; 2378 + 2500 2379 /* first collect one page from each data stripe */ 2501 2380 for (stripe = 0; stripe < nr_data; stripe++) { 2502 - p = page_in_rbio(rbio, stripe, pagenr, 0); 2503 - pointers[stripe] = kmap_local_page(p); 2381 + sector = sector_in_rbio(rbio, stripe, sectornr, 0); 2382 + pointers[stripe] = kmap_local_page(sector->page) + 2383 + sector->pgoff; 2504 2384 } 2505 2385 2506 2386 if (has_qstripe) { 2507 2387 /* RAID6, call the library function to fill in our P/Q */ 2508 - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 2388 + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 2509 2389 pointers); 2510 2390 } else { 2511 2391 /* raid5 */ 2512 - copy_page(pointers[nr_data], pointers[0]); 2513 - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); 2392 + memcpy(pointers[nr_data], pointers[0], sectorsize); 2393 + run_xor(pointers + 1, nr_data - 1, sectorsize); 2514 2394 } 2515 2395 2516 2396 /* Check scrubbing parity and repair it */ 2517 - p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2518 - parity = kmap_local_page(p); 2519 - if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) 2520 - copy_page(parity, pointers[rbio->scrubp]); 2397 + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 2398 + parity = kmap_local_page(sector->page) + sector->pgoff; 2399 + if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) 2400 + memcpy(parity, pointers[rbio->scrubp], sectorsize); 2521 2401 else 2522 2402 /* Parity is right, needn't writeback */ 2523 - bitmap_clear(rbio->dbitmap, pagenr, 1); 2403 + bitmap_clear(rbio->dbitmap, sectornr, 1); 2524 2404 kunmap_local(parity); 2525 2405 2526 2406 for (stripe = nr_data - 1; stripe >= 0; stripe--) ··· 2533 2403 } 2534 2404 2535 2405 kunmap_local(pointers[nr_data]); 2536 - __free_page(p_page); 2537 - if (q_page) { 2406 + __free_page(p_sector.page); 2407 + p_sector.page = NULL; 2408 + if (q_sector.page) { 2538 2409 
kunmap_local(pointers[rbio->real_stripes - 1]); 2539 - __free_page(q_page); 2410 + __free_page(q_sector.page); 2411 + q_sector.page = NULL; 2540 2412 } 2541 2413 2542 2414 writeback: ··· 2547 2415 * higher layers (the bio_list in our rbio) and our p/q. Ignore 2548 2416 * everything else. 2549 2417 */ 2550 - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2551 - struct page *page; 2418 + for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { 2419 + struct sector_ptr *sector; 2552 2420 2553 - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2554 - ret = rbio_add_io_page(rbio, &bio_list, 2555 - page, rbio->scrubp, pagenr, rbio->stripe_len); 2421 + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 2422 + ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, 2423 + sectornr, rbio->stripe_len, REQ_OP_WRITE); 2556 2424 if (ret) 2557 2425 goto cleanup; 2558 2426 } ··· 2560 2428 if (!is_replace) 2561 2429 goto submit_write; 2562 2430 2563 - for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { 2564 - struct page *page; 2431 + for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { 2432 + struct sector_ptr *sector; 2565 2433 2566 - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2567 - ret = rbio_add_io_page(rbio, &bio_list, page, 2434 + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 2435 + ret = rbio_add_io_sector(rbio, &bio_list, sector, 2568 2436 bioc->tgtdev_map[rbio->scrubp], 2569 - pagenr, rbio->stripe_len); 2437 + sectornr, rbio->stripe_len, REQ_OP_WRITE); 2570 2438 if (ret) 2571 2439 goto cleanup; 2572 2440 } ··· 2582 2450 atomic_set(&rbio->stripes_pending, nr_data); 2583 2451 2584 2452 while ((bio = bio_list_pop(&bio_list))) { 2585 - bio->bi_private = rbio; 2586 2453 bio->bi_end_io = raid_write_end_io; 2587 - bio->bi_opf = REQ_OP_WRITE; 2588 2454 2589 2455 submit_bio(bio); 2590 2456 } ··· 2678 2548 if (bio->bi_status) 2679 2549 fail_bio_stripe(rbio, bio); 2680 2550 else 2681 - 
set_bio_pages_uptodate(bio); 2551 + set_bio_pages_uptodate(rbio, bio); 2682 2552 2683 2553 bio_put(bio); 2684 2554 ··· 2698 2568 int bios_to_read = 0; 2699 2569 struct bio_list bio_list; 2700 2570 int ret; 2701 - int pagenr; 2571 + int sectornr; 2702 2572 int stripe; 2703 2573 struct bio *bio; 2704 2574 ··· 2714 2584 * stripe 2715 2585 */ 2716 2586 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2717 - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2718 - struct page *page; 2587 + for_each_set_bit(sectornr , rbio->dbitmap, rbio->stripe_nsectors) { 2588 + struct sector_ptr *sector; 2719 2589 /* 2720 - * we want to find all the pages missing from 2721 - * the rbio and read them from the disk. If 2722 - * page_in_rbio finds a page in the bio list 2723 - * we don't need to read it off the stripe. 2590 + * We want to find all the sectors missing from the 2591 + * rbio and read them from the disk. If * sector_in_rbio() 2592 + * finds a sector in the bio list we don't need to read 2593 + * it off the stripe. 2724 2594 */ 2725 - page = page_in_rbio(rbio, stripe, pagenr, 1); 2726 - if (page) 2595 + sector = sector_in_rbio(rbio, stripe, sectornr, 1); 2596 + if (sector) 2727 2597 continue; 2728 2598 2729 - page = rbio_stripe_page(rbio, stripe, pagenr); 2599 + sector = rbio_stripe_sector(rbio, stripe, sectornr); 2730 2600 /* 2731 - * the bio cache may have handed us an uptodate 2732 - * page. If so, be happy and use it 2601 + * The bio cache may have handed us an uptodate sector. 2602 + * If so, be happy and use it. 
2733 2603 */ 2734 - if (PageUptodate(page)) 2604 + if (sector->uptodate) 2735 2605 continue; 2736 2606 2737 - ret = rbio_add_io_page(rbio, &bio_list, page, 2738 - stripe, pagenr, rbio->stripe_len); 2607 + ret = rbio_add_io_sector(rbio, &bio_list, sector, 2608 + stripe, sectornr, rbio->stripe_len, 2609 + REQ_OP_READ); 2739 2610 if (ret) 2740 2611 goto cleanup; 2741 2612 } ··· 2759 2628 */ 2760 2629 atomic_set(&rbio->stripes_pending, bios_to_read); 2761 2630 while ((bio = bio_list_pop(&bio_list))) { 2762 - bio->bi_private = rbio; 2763 2631 bio->bi_end_io = raid56_parity_scrub_end_io; 2764 - bio->bi_opf = REQ_OP_READ; 2765 2632 2766 2633 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 2767 2634 ··· 2780 2651 validate_rbio_for_parity_scrub(rbio); 2781 2652 } 2782 2653 2783 - static void scrub_parity_work(struct btrfs_work *work) 2654 + static void scrub_parity_work(struct work_struct *work) 2784 2655 { 2785 2656 struct btrfs_raid_bio *rbio; 2786 2657

+4 -5

fs/btrfs/raid56.h

··· 31 31 struct btrfs_device; 32 32 33 33 int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, 34 - u64 stripe_len, int mirror_num, int generic_io); 35 - int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, 36 - u64 stripe_len); 34 + u32 stripe_len, int mirror_num, int generic_io); 35 + int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len); 37 36 38 37 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, 39 - u64 logical); 38 + unsigned int pgoff, u64 logical); 40 39 41 40 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, 42 - struct btrfs_io_context *bioc, u64 stripe_len, 41 + struct btrfs_io_context *bioc, u32 stripe_len, 43 42 struct btrfs_device *scrub_dev, 44 43 unsigned long *dbitmap, int stripe_nsectors); 45 44 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);

+11 -12

fs/btrfs/reflink.c

··· 614 614 static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, 615 615 struct inode *inode2, u64 loff2, u64 len) 616 616 { 617 + u64 range1_end = loff1 + len - 1; 618 + u64 range2_end = loff2 + len - 1; 619 + 617 620 if (inode1 < inode2) { 618 621 swap(inode1, inode2); 619 622 swap(loff1, loff2); 623 + swap(range1_end, range2_end); 620 624 } else if (inode1 == inode2 && loff2 < loff1) { 621 625 swap(loff1, loff2); 626 + swap(range1_end, range2_end); 622 627 } 623 - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); 624 - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); 628 + 629 + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end); 630 + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end); 631 + 632 + btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); 633 + btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); 625 634 } 626 635 627 636 static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) ··· 780 771 struct inode *inode_in = file_inode(file_in); 781 772 struct inode *inode_out = file_inode(file_out); 782 773 u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; 783 - bool same_inode = inode_out == inode_in; 784 774 u64 wb_len; 785 775 int ret; 786 776 ··· 816 808 wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); 817 809 else 818 810 wb_len = ALIGN(*len, bs); 819 - 820 - /* 821 - * Since we don't lock ranges, wait for ongoing lockless dio writes (as 822 - * any in progress could create its ordered extents after we wait for 823 - * existing ordered extents below). 824 - */ 825 - inode_dio_wait(inode_in); 826 - if (!same_inode) 827 - inode_dio_wait(inode_out); 828 811 829 812 /* 830 813 * Workaround to make sure NOCOW buffered write reach disk as NOCOW.

+15 -4

fs/btrfs/relocation.c

··· 362 362 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); 363 363 if (rb_node) { 364 364 node = rb_entry(rb_node, struct mapping_node, rb_node); 365 - root = (struct btrfs_root *)node->data; 365 + root = node->data; 366 366 } 367 367 spin_unlock(&rc->reloc_root_tree.lock); 368 368 return btrfs_grab_root(root); ··· 2997 2997 2998 2998 /* Reserve metadata for this range */ 2999 2999 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 3000 - clamped_len, clamped_len); 3000 + clamped_len, clamped_len, 3001 + false); 3001 3002 if (ret) 3002 3003 goto release_page; 3003 3004 ··· 3846 3845 btrfs_end_transaction(trans); 3847 3846 btrfs_btree_balance_dirty(fs_info); 3848 3847 if (err) { 3849 - if (inode) 3850 - iput(inode); 3848 + iput(inode); 3851 3849 inode = ERR_PTR(err); 3852 3850 } 3853 3851 return inode; ··· 3976 3976 bg = btrfs_lookup_block_group(fs_info, group_start); 3977 3977 if (!bg) 3978 3978 return -ENOENT; 3979 + 3980 + /* 3981 + * Relocation of a data block group creates ordered extents. Without 3982 + * sb_start_write(), we can freeze the filesystem while unfinished 3983 + * ordered extents are left. Such ordered extents can cause a deadlock 3984 + * e.g. when syncfs() is waiting for their completion but they can't 3985 + * finish because they block when joining a transaction, due to the 3986 + * fact that the freeze locks are being held in write mode. 3987 + */ 3988 + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) 3989 + ASSERT(sb_write_started(fs_info->sb)); 3979 3990 3980 3991 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 3981 3992 btrfs_put_block_group(bg);

+2 -1

fs/btrfs/root-tree.c

··· 509 509 /* One for parent inode, two for dir entries */ 510 510 qgroup_num_bytes = 3 * fs_info->nodesize; 511 511 ret = btrfs_qgroup_reserve_meta_prealloc(root, 512 - qgroup_num_bytes, true); 512 + qgroup_num_bytes, true, 513 + false); 513 514 if (ret) 514 515 return ret; 515 516 }

+977 -922

fs/btrfs/scrub.c

··· 45 45 * operations. The first one configures an upper limit for the number 46 46 * of (dynamically allocated) pages that are added to a bio. 47 47 */ 48 - #define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */ 49 - #define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */ 48 + #define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */ 49 + #define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */ 50 50 51 51 /* 52 52 * The following value times PAGE_SIZE needs to be large enough to match the 53 53 * largest node/leaf/sector size that shall be supported. 54 54 */ 55 - #define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) 55 + #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) 56 56 57 57 struct scrub_recover { 58 58 refcount_t refs; ··· 60 60 u64 map_length; 61 61 }; 62 62 63 - struct scrub_page { 63 + struct scrub_sector { 64 64 struct scrub_block *sblock; 65 65 struct page *page; 66 66 struct btrfs_device *dev; ··· 87 87 blk_status_t status; 88 88 u64 logical; 89 89 u64 physical; 90 - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; 91 - int page_count; 90 + struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO]; 91 + int sector_count; 92 92 int next_free; 93 - struct btrfs_work work; 93 + struct work_struct work; 94 94 }; 95 95 96 96 struct scrub_block { 97 - struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 98 - int page_count; 99 - atomic_t outstanding_pages; 97 + struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK]; 98 + int sector_count; 99 + atomic_t outstanding_sectors; 100 100 refcount_t refs; /* free mem on transition to zero */ 101 101 struct scrub_ctx *sctx; 102 102 struct scrub_parity *sparity; ··· 110 110 /* It is for the data with checksum */ 111 111 unsigned int data_corrected:1; 112 112 }; 113 - struct btrfs_work work; 113 + struct work_struct work; 114 114 }; 115 115 116 116 /* Used for the chunks with parity stripe such RAID5/6 */ ··· 129 129 130 130 
refcount_t refs; 131 131 132 - struct list_head spages; 132 + struct list_head sectors_list; 133 133 134 134 /* Work of parity check and repair */ 135 - struct btrfs_work work; 135 + struct work_struct work; 136 136 137 137 /* Mark the parity blocks which have data */ 138 138 unsigned long *dbitmap; ··· 158 158 struct list_head csum_list; 159 159 atomic_t cancel_req; 160 160 int readonly; 161 - int pages_per_bio; 161 + int sectors_per_bio; 162 162 163 163 /* State of IO submission throttling affecting the associated device */ 164 164 ktime_t throttle_deadline; ··· 212 212 static void scrub_recheck_block_checksum(struct scrub_block *sblock); 213 213 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 214 214 struct scrub_block *sblock_good); 215 - static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 215 + static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, 216 216 struct scrub_block *sblock_good, 217 - int page_num, int force_write); 217 + int sector_num, int force_write); 218 218 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); 219 - static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, 220 - int page_num); 219 + static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, 220 + int sector_num); 221 221 static int scrub_checksum_data(struct scrub_block *sblock); 222 222 static int scrub_checksum_tree_block(struct scrub_block *sblock); 223 223 static int scrub_checksum_super(struct scrub_block *sblock); 224 224 static void scrub_block_put(struct scrub_block *sblock); 225 - static void scrub_page_get(struct scrub_page *spage); 226 - static void scrub_page_put(struct scrub_page *spage); 225 + static void scrub_sector_get(struct scrub_sector *sector); 226 + static void scrub_sector_put(struct scrub_sector *sector); 227 227 static void scrub_parity_get(struct scrub_parity *sparity); 228 228 static void scrub_parity_put(struct scrub_parity 
*sparity); 229 - static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, 230 - u64 physical, struct btrfs_device *dev, u64 flags, 231 - u64 gen, int mirror_num, u8 *csum, 232 - u64 physical_for_dev_replace); 229 + static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, 230 + u64 physical, struct btrfs_device *dev, u64 flags, 231 + u64 gen, int mirror_num, u8 *csum, 232 + u64 physical_for_dev_replace); 233 233 static void scrub_bio_end_io(struct bio *bio); 234 - static void scrub_bio_end_io_worker(struct btrfs_work *work); 234 + static void scrub_bio_end_io_worker(struct work_struct *work); 235 235 static void scrub_block_complete(struct scrub_block *sblock); 236 - static void scrub_remap_extent(struct btrfs_fs_info *fs_info, 237 - u64 extent_logical, u32 extent_len, 238 - u64 *extent_physical, 239 - struct btrfs_device **extent_dev, 240 - int *extent_mirror_num); 241 - static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, 242 - struct scrub_page *spage); 236 + static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, 237 + u64 extent_logical, u32 extent_len, 238 + u64 *extent_physical, 239 + struct btrfs_device **extent_dev, 240 + int *extent_mirror_num); 241 + static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, 242 + struct scrub_sector *sector); 243 243 static void scrub_wr_submit(struct scrub_ctx *sctx); 244 244 static void scrub_wr_bio_end_io(struct bio *bio); 245 - static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); 245 + static void scrub_wr_bio_end_io_worker(struct work_struct *work); 246 246 static void scrub_put_ctx(struct scrub_ctx *sctx); 247 247 248 - static inline int scrub_is_page_on_raid56(struct scrub_page *spage) 248 + static inline int scrub_is_page_on_raid56(struct scrub_sector *sector) 249 249 { 250 - return spage->recover && 251 - (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); 250 + return sector->recover && 251 + (sector->recover->bioc->map_type & 
BTRFS_BLOCK_GROUP_RAID56_MASK); 252 252 } 253 253 254 254 static void scrub_pending_bio_inc(struct scrub_ctx *sctx) ··· 535 535 if (sctx->curr != -1) { 536 536 struct scrub_bio *sbio = sctx->bios[sctx->curr]; 537 537 538 - for (i = 0; i < sbio->page_count; i++) { 539 - WARN_ON(!sbio->pagev[i]->page); 540 - scrub_block_put(sbio->pagev[i]->sblock); 538 + for (i = 0; i < sbio->sector_count; i++) { 539 + WARN_ON(!sbio->sectors[i]->page); 540 + scrub_block_put(sbio->sectors[i]->sblock); 541 541 } 542 542 bio_put(sbio->bio); 543 543 } ··· 572 572 goto nomem; 573 573 refcount_set(&sctx->refs, 1); 574 574 sctx->is_dev_replace = is_dev_replace; 575 - sctx->pages_per_bio = SCRUB_PAGES_PER_BIO; 575 + sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO; 576 576 sctx->curr = -1; 577 577 sctx->fs_info = fs_info; 578 578 INIT_LIST_HEAD(&sctx->csum_list); ··· 586 586 587 587 sbio->index = i; 588 588 sbio->sctx = sctx; 589 - sbio->page_count = 0; 590 - btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL, 591 - NULL); 589 + sbio->sector_count = 0; 590 + INIT_WORK(&sbio->work, scrub_bio_end_io_worker); 592 591 593 592 if (i != SCRUB_BIOS_PER_SCTX - 1) 594 593 sctx->bios[i]->next_free = i + 1; ··· 727 728 u8 ref_level = 0; 728 729 int ret; 729 730 730 - WARN_ON(sblock->page_count < 1); 731 - dev = sblock->pagev[0]->dev; 731 + WARN_ON(sblock->sector_count < 1); 732 + dev = sblock->sectors[0]->dev; 732 733 fs_info = sblock->sctx->fs_info; 733 734 734 735 path = btrfs_alloc_path(); 735 736 if (!path) 736 737 return; 737 738 738 - swarn.physical = sblock->pagev[0]->physical; 739 - swarn.logical = sblock->pagev[0]->logical; 739 + swarn.physical = sblock->sectors[0]->physical; 740 + swarn.logical = sblock->sectors[0]->logical; 740 741 swarn.errstr = errstr; 741 742 swarn.dev = NULL; 742 743 ··· 797 798 798 799 /* 799 800 * scrub_handle_errored_block gets called when either verification of the 800 - * pages failed or the bio failed to read, e.g. with EIO. 
In the latter 801 - * case, this function handles all pages in the bio, even though only one 801 + * sectors failed or the bio failed to read, e.g. with EIO. In the latter 802 + * case, this function handles all sectors in the bio, even though only one 802 803 * may be bad. 803 804 * The goal of this function is to repair the errored block by using the 804 805 * contents of one of the mirrors. ··· 816 817 struct scrub_block *sblock_bad; 817 818 int ret; 818 819 int mirror_index; 819 - int page_num; 820 + int sector_num; 820 821 int success; 821 822 bool full_stripe_locked; 822 823 unsigned int nofs_flag; 823 824 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, 824 825 DEFAULT_RATELIMIT_BURST); 825 826 826 - BUG_ON(sblock_to_check->page_count < 1); 827 + BUG_ON(sblock_to_check->sector_count < 1); 827 828 fs_info = sctx->fs_info; 828 - if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { 829 + if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { 829 830 /* 830 831 * if we find an error in a super block, we just report it. 
831 832 * They will get written with the next transaction commit ··· 836 837 spin_unlock(&sctx->stat_lock); 837 838 return 0; 838 839 } 839 - logical = sblock_to_check->pagev[0]->logical; 840 - BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); 841 - failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; 842 - is_metadata = !(sblock_to_check->pagev[0]->flags & 840 + logical = sblock_to_check->sectors[0]->logical; 841 + BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1); 842 + failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1; 843 + is_metadata = !(sblock_to_check->sectors[0]->flags & 843 844 BTRFS_EXTENT_FLAG_DATA); 844 - have_csum = sblock_to_check->pagev[0]->have_csum; 845 - dev = sblock_to_check->pagev[0]->dev; 845 + have_csum = sblock_to_check->sectors[0]->have_csum; 846 + dev = sblock_to_check->sectors[0]->dev; 846 847 847 848 if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) 848 849 return 0; ··· 853 854 * might be waiting the scrub task to pause (which needs to wait for all 854 855 * the worker tasks to complete before pausing). 855 856 * We do allocations in the workers through insert_full_stripe_lock() 856 - * and scrub_add_page_to_wr_bio(), which happens down the call chain of 857 + * and scrub_add_sector_to_wr_bio(), which happens down the call chain of 857 858 * this function. 
858 859 */ 859 860 nofs_flag = memalloc_nofs_save(); ··· 917 918 goto out; 918 919 } 919 920 920 - /* setup the context, map the logical blocks and alloc the pages */ 921 + /* Setup the context, map the logical blocks and alloc the sectors */ 921 922 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); 922 923 if (ret) { 923 924 spin_lock(&sctx->stat_lock); ··· 936 937 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 937 938 sblock_bad->no_io_error_seen) { 938 939 /* 939 - * the error disappeared after reading page by page, or 940 + * The error disappeared after reading sector by sector, or 940 941 * the area was part of a huge bio and other parts of the 941 942 * bio caused I/O errors, or the block layer merged several 942 943 * read requests into one and the error is caused by a ··· 997 998 * that is known to contain an error is rewritten. Afterwards 998 999 * the block is known to be corrected. 999 1000 * If a mirror is found which is completely correct, and no 1000 - * checksum is present, only those pages are rewritten that had 1001 + * checksum is present, only those sectors are rewritten that had 1001 1002 * an I/O error in the block to be repaired, since it cannot be 1002 - * determined, which copy of the other pages is better (and it 1003 - * could happen otherwise that a correct page would be 1003 + * determined, which copy of the other sectors is better (and it 1004 + * could happen otherwise that a correct sector would be 1004 1005 * overwritten by a bad one). 
1005 1006 */ 1006 1007 for (mirror_index = 0; ;mirror_index++) { ··· 1010 1011 continue; 1011 1012 1012 1013 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ 1013 - if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) { 1014 + if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { 1014 1015 if (mirror_index >= BTRFS_MAX_MIRRORS) 1015 1016 break; 1016 - if (!sblocks_for_recheck[mirror_index].page_count) 1017 + if (!sblocks_for_recheck[mirror_index].sector_count) 1017 1018 break; 1018 1019 1019 1020 sblock_other = sblocks_for_recheck + mirror_index; 1020 1021 } else { 1021 - struct scrub_recover *r = sblock_bad->pagev[0]->recover; 1022 + struct scrub_recover *r = sblock_bad->sectors[0]->recover; 1022 1023 int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; 1023 1024 1024 1025 if (mirror_index >= max_allowed) 1025 1026 break; 1026 - if (!sblocks_for_recheck[1].page_count) 1027 + if (!sblocks_for_recheck[1].sector_count) 1027 1028 break; 1028 1029 1029 1030 ASSERT(failed_mirror_index == 0); 1030 1031 sblock_other = sblocks_for_recheck + 1; 1031 - sblock_other->pagev[0]->mirror_num = 1 + mirror_index; 1032 + sblock_other->sectors[0]->mirror_num = 1 + mirror_index; 1032 1033 } 1033 1034 1034 1035 /* build and submit the bios, check checksums */ ··· 1077 1078 * area are unreadable. 
1078 1079 */ 1079 1080 success = 1; 1080 - for (page_num = 0; page_num < sblock_bad->page_count; 1081 - page_num++) { 1082 - struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; 1081 + for (sector_num = 0; sector_num < sblock_bad->sector_count; 1082 + sector_num++) { 1083 + struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; 1083 1084 struct scrub_block *sblock_other = NULL; 1084 1085 1085 - /* skip no-io-error page in scrub */ 1086 - if (!spage_bad->io_error && !sctx->is_dev_replace) 1086 + /* Skip no-io-error sectors in scrub */ 1087 + if (!sector_bad->io_error && !sctx->is_dev_replace) 1087 1088 continue; 1088 1089 1089 - if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) { 1090 + if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) { 1090 1091 /* 1091 1092 * In case of dev replace, if raid56 rebuild process 1092 1093 * didn't work out correct data, then copy the content ··· 1095 1096 * sblock_for_recheck array to target device. 1096 1097 */ 1097 1098 sblock_other = NULL; 1098 - } else if (spage_bad->io_error) { 1099 - /* try to find no-io-error page in mirrors */ 1099 + } else if (sector_bad->io_error) { 1100 + /* Try to find no-io-error sector in mirrors */ 1100 1101 for (mirror_index = 0; 1101 1102 mirror_index < BTRFS_MAX_MIRRORS && 1102 - sblocks_for_recheck[mirror_index].page_count > 0; 1103 + sblocks_for_recheck[mirror_index].sector_count > 0; 1103 1104 mirror_index++) { 1104 1105 if (!sblocks_for_recheck[mirror_index]. 1105 - pagev[page_num]->io_error) { 1106 + sectors[sector_num]->io_error) { 1106 1107 sblock_other = sblocks_for_recheck + 1107 1108 mirror_index; 1108 1109 break; ··· 1114 1115 1115 1116 if (sctx->is_dev_replace) { 1116 1117 /* 1117 - * did not find a mirror to fetch the page 1118 - * from. scrub_write_page_to_dev_replace() 1119 - * handles this case (page->io_error), by 1120 - * filling the block with zeros before 1121 - * submitting the write request 1118 + * Did not find a mirror to fetch the sector from. 
1119 + * scrub_write_sector_to_dev_replace() handles this 1120 + * case (sector->io_error), by filling the block with 1121 + * zeros before submitting the write request 1122 1122 */ 1123 1123 if (!sblock_other) 1124 1124 sblock_other = sblock_bad; 1125 1125 1126 - if (scrub_write_page_to_dev_replace(sblock_other, 1127 - page_num) != 0) { 1126 + if (scrub_write_sector_to_dev_replace(sblock_other, 1127 + sector_num) != 0) { 1128 1128 atomic64_inc( 1129 1129 &fs_info->dev_replace.num_write_errors); 1130 1130 success = 0; 1131 1131 } 1132 1132 } else if (sblock_other) { 1133 - ret = scrub_repair_page_from_good_copy(sblock_bad, 1134 - sblock_other, 1135 - page_num, 0); 1133 + ret = scrub_repair_sector_from_good_copy(sblock_bad, 1134 + sblock_other, 1135 + sector_num, 0); 1136 1136 if (0 == ret) 1137 - spage_bad->io_error = 0; 1137 + sector_bad->io_error = 0; 1138 1138 else 1139 1139 success = 0; 1140 1140 } ··· 1184 1186 struct scrub_block *sblock = sblocks_for_recheck + 1185 1187 mirror_index; 1186 1188 struct scrub_recover *recover; 1187 - int page_index; 1189 + int i; 1188 1190 1189 - for (page_index = 0; page_index < sblock->page_count; 1190 - page_index++) { 1191 - sblock->pagev[page_index]->sblock = NULL; 1192 - recover = sblock->pagev[page_index]->recover; 1191 + for (i = 0; i < sblock->sector_count; i++) { 1192 + sblock->sectors[i]->sblock = NULL; 1193 + recover = sblock->sectors[i]->recover; 1193 1194 if (recover) { 1194 1195 scrub_put_recover(fs_info, recover); 1195 - sblock->pagev[page_index]->recover = 1196 - NULL; 1196 + sblock->sectors[i]->recover = NULL; 1197 1197 } 1198 - scrub_page_put(sblock->pagev[page_index]); 1198 + scrub_sector_put(sblock->sectors[i]); 1199 1199 } 1200 1200 } 1201 1201 kfree(sblocks_for_recheck); ··· 1251 1255 { 1252 1256 struct scrub_ctx *sctx = original_sblock->sctx; 1253 1257 struct btrfs_fs_info *fs_info = sctx->fs_info; 1254 - u64 length = original_sblock->page_count * fs_info->sectorsize; 1255 - u64 logical = 
original_sblock->pagev[0]->logical; 1256 - u64 generation = original_sblock->pagev[0]->generation; 1257 - u64 flags = original_sblock->pagev[0]->flags; 1258 - u64 have_csum = original_sblock->pagev[0]->have_csum; 1258 + u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; 1259 + u64 logical = original_sblock->sectors[0]->logical; 1260 + u64 generation = original_sblock->sectors[0]->generation; 1261 + u64 flags = original_sblock->sectors[0]->flags; 1262 + u64 have_csum = original_sblock->sectors[0]->have_csum; 1259 1263 struct scrub_recover *recover; 1260 1264 struct btrfs_io_context *bioc; 1261 1265 u64 sublen; 1262 1266 u64 mapped_length; 1263 1267 u64 stripe_offset; 1264 1268 int stripe_index; 1265 - int page_index = 0; 1269 + int sector_index = 0; 1266 1270 int mirror_index; 1267 1271 int nmirrors; 1268 1272 int ret; 1269 1273 1270 1274 /* 1271 - * note: the two members refs and outstanding_pages 1272 - * are not used (and not set) in the blocks that are used for 1273 - * the recheck procedure 1275 + * Note: the two members refs and outstanding_sectors are not used (and 1276 + * not set) in the blocks that are used for the recheck procedure. 
1274 1277 */ 1275 1278 1276 1279 while (length > 0) { ··· 1301 1306 recover->bioc = bioc; 1302 1307 recover->map_length = mapped_length; 1303 1308 1304 - ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK); 1309 + ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK); 1305 1310 1306 1311 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); 1307 1312 1308 1313 for (mirror_index = 0; mirror_index < nmirrors; 1309 1314 mirror_index++) { 1310 1315 struct scrub_block *sblock; 1311 - struct scrub_page *spage; 1316 + struct scrub_sector *sector; 1312 1317 1313 1318 sblock = sblocks_for_recheck + mirror_index; 1314 1319 sblock->sctx = sctx; 1315 1320 1316 - spage = kzalloc(sizeof(*spage), GFP_NOFS); 1317 - if (!spage) { 1321 + sector = kzalloc(sizeof(*sector), GFP_NOFS); 1322 + if (!sector) { 1318 1323 leave_nomem: 1319 1324 spin_lock(&sctx->stat_lock); 1320 1325 sctx->stat.malloc_errors++; ··· 1322 1327 scrub_put_recover(fs_info, recover); 1323 1328 return -ENOMEM; 1324 1329 } 1325 - scrub_page_get(spage); 1326 - sblock->pagev[page_index] = spage; 1327 - spage->sblock = sblock; 1328 - spage->flags = flags; 1329 - spage->generation = generation; 1330 - spage->logical = logical; 1331 - spage->have_csum = have_csum; 1330 + scrub_sector_get(sector); 1331 + sblock->sectors[sector_index] = sector; 1332 + sector->sblock = sblock; 1333 + sector->flags = flags; 1334 + sector->generation = generation; 1335 + sector->logical = logical; 1336 + sector->have_csum = have_csum; 1332 1337 if (have_csum) 1333 - memcpy(spage->csum, 1334 - original_sblock->pagev[0]->csum, 1338 + memcpy(sector->csum, 1339 + original_sblock->sectors[0]->csum, 1335 1340 sctx->fs_info->csum_size); 1336 1341 1337 1342 scrub_stripe_index_and_offset(logical, ··· 1343 1348 mirror_index, 1344 1349 &stripe_index, 1345 1350 &stripe_offset); 1346 - spage->physical = bioc->stripes[stripe_index].physical + 1351 + sector->physical = bioc->stripes[stripe_index].physical + 1347 1352 stripe_offset; 1348 - spage->dev = 
bioc->stripes[stripe_index].dev; 1353 + sector->dev = bioc->stripes[stripe_index].dev; 1349 1354 1350 - BUG_ON(page_index >= original_sblock->page_count); 1351 - spage->physical_for_dev_replace = 1352 - original_sblock->pagev[page_index]-> 1355 + BUG_ON(sector_index >= original_sblock->sector_count); 1356 + sector->physical_for_dev_replace = 1357 + original_sblock->sectors[sector_index]-> 1353 1358 physical_for_dev_replace; 1354 - /* for missing devices, dev->bdev is NULL */ 1355 - spage->mirror_num = mirror_index + 1; 1356 - sblock->page_count++; 1357 - spage->page = alloc_page(GFP_NOFS); 1358 - if (!spage->page) 1359 + /* For missing devices, dev->bdev is NULL */ 1360 + sector->mirror_num = mirror_index + 1; 1361 + sblock->sector_count++; 1362 + sector->page = alloc_page(GFP_NOFS); 1363 + if (!sector->page) 1359 1364 goto leave_nomem; 1360 1365 1361 1366 scrub_get_recover(recover); 1362 - spage->recover = recover; 1367 + sector->recover = recover; 1363 1368 } 1364 1369 scrub_put_recover(fs_info, recover); 1365 1370 length -= sublen; 1366 1371 logical += sublen; 1367 - page_index++; 1372 + sector_index++; 1368 1373 } 1369 1374 1370 1375 return 0; ··· 1377 1382 1378 1383 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, 1379 1384 struct bio *bio, 1380 - struct scrub_page *spage) 1385 + struct scrub_sector *sector) 1381 1386 { 1382 1387 DECLARE_COMPLETION_ONSTACK(done); 1383 1388 int ret; 1384 1389 int mirror_num; 1385 1390 1386 - bio->bi_iter.bi_sector = spage->logical >> 9; 1391 + bio->bi_iter.bi_sector = sector->logical >> 9; 1387 1392 bio->bi_private = &done; 1388 1393 bio->bi_end_io = scrub_bio_wait_endio; 1389 1394 1390 - mirror_num = spage->sblock->pagev[0]->mirror_num; 1391 - ret = raid56_parity_recover(bio, spage->recover->bioc, 1392 - spage->recover->map_length, 1395 + mirror_num = sector->sblock->sectors[0]->mirror_num; 1396 + ret = raid56_parity_recover(bio, sector->recover->bioc, 1397 + sector->recover->map_length, 1393 1398 
mirror_num, 0); 1394 1399 if (ret) 1395 1400 return ret; ··· 1401 1406 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, 1402 1407 struct scrub_block *sblock) 1403 1408 { 1404 - struct scrub_page *first_page = sblock->pagev[0]; 1409 + struct scrub_sector *first_sector = sblock->sectors[0]; 1405 1410 struct bio *bio; 1406 - int page_num; 1411 + int i; 1407 1412 1408 - /* All pages in sblock belong to the same stripe on the same device. */ 1409 - ASSERT(first_page->dev); 1410 - if (!first_page->dev->bdev) 1413 + /* All sectors in sblock belong to the same stripe on the same device. */ 1414 + ASSERT(first_sector->dev); 1415 + if (!first_sector->dev->bdev) 1411 1416 goto out; 1412 1417 1413 - bio = btrfs_bio_alloc(BIO_MAX_VECS); 1414 - bio_set_dev(bio, first_page->dev->bdev); 1418 + bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); 1415 1419 1416 - for (page_num = 0; page_num < sblock->page_count; page_num++) { 1417 - struct scrub_page *spage = sblock->pagev[page_num]; 1420 + for (i = 0; i < sblock->sector_count; i++) { 1421 + struct scrub_sector *sector = sblock->sectors[i]; 1418 1422 1419 - WARN_ON(!spage->page); 1420 - bio_add_page(bio, spage->page, PAGE_SIZE, 0); 1423 + WARN_ON(!sector->page); 1424 + bio_add_page(bio, sector->page, PAGE_SIZE, 0); 1421 1425 } 1422 1426 1423 - if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) { 1427 + if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { 1424 1428 bio_put(bio); 1425 1429 goto out; 1426 1430 } ··· 1430 1436 1431 1437 return; 1432 1438 out: 1433 - for (page_num = 0; page_num < sblock->page_count; page_num++) 1434 - sblock->pagev[page_num]->io_error = 1; 1439 + for (i = 0; i < sblock->sector_count; i++) 1440 + sblock->sectors[i]->io_error = 1; 1435 1441 1436 1442 sblock->no_io_error_seen = 0; 1437 1443 } 1438 1444 1439 1445 /* 1440 - * this function will check the on disk data for checksum errors, header 1441 - * errors and read I/O errors. 
If any I/O errors happen, the exact pages 1442 - * which are errored are marked as being bad. The goal is to enable scrub 1443 - * to take those pages that are not errored from all the mirrors so that 1444 - * the pages that are errored in the just handled mirror can be repaired. 1446 + * This function will check the on disk data for checksum errors, header errors 1447 + * and read I/O errors. If any I/O errors happen, the exact sectors which are 1448 + * errored are marked as being bad. The goal is to enable scrub to take those 1449 + * sectors that are not errored from all the mirrors so that the sectors that 1450 + * are errored in the just handled mirror can be repaired. 1445 1451 */ 1446 1452 static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 1447 1453 struct scrub_block *sblock, 1448 1454 int retry_failed_mirror) 1449 1455 { 1450 - int page_num; 1456 + int i; 1451 1457 1452 1458 sblock->no_io_error_seen = 1; 1453 1459 1454 1460 /* short cut for raid56 */ 1455 - if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0])) 1461 + if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0])) 1456 1462 return scrub_recheck_block_on_raid56(fs_info, sblock); 1457 1463 1458 - for (page_num = 0; page_num < sblock->page_count; page_num++) { 1459 - struct bio *bio; 1460 - struct scrub_page *spage = sblock->pagev[page_num]; 1464 + for (i = 0; i < sblock->sector_count; i++) { 1465 + struct scrub_sector *sector = sblock->sectors[i]; 1466 + struct bio bio; 1467 + struct bio_vec bvec; 1461 1468 1462 - if (spage->dev->bdev == NULL) { 1463 - spage->io_error = 1; 1469 + if (sector->dev->bdev == NULL) { 1470 + sector->io_error = 1; 1464 1471 sblock->no_io_error_seen = 0; 1465 1472 continue; 1466 1473 } 1467 1474 1468 - WARN_ON(!spage->page); 1469 - bio = btrfs_bio_alloc(1); 1470 - bio_set_dev(bio, spage->dev->bdev); 1475 + WARN_ON(!sector->page); 1476 + bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ); 1477 + bio_add_page(&bio, 
sector->page, fs_info->sectorsize, 0); 1478 + bio.bi_iter.bi_sector = sector->physical >> 9; 1471 1479 1472 - bio_add_page(bio, spage->page, fs_info->sectorsize, 0); 1473 - bio->bi_iter.bi_sector = spage->physical >> 9; 1474 - bio->bi_opf = REQ_OP_READ; 1475 - 1476 - if (btrfsic_submit_bio_wait(bio)) { 1477 - spage->io_error = 1; 1480 + btrfsic_check_bio(&bio); 1481 + if (submit_bio_wait(&bio)) { 1482 + sector->io_error = 1; 1478 1483 sblock->no_io_error_seen = 0; 1479 1484 } 1480 1485 1481 - bio_put(bio); 1486 + bio_uninit(&bio); 1482 1487 } 1483 1488 1484 1489 if (sblock->no_io_error_seen) 1485 1490 scrub_recheck_block_checksum(sblock); 1486 1491 } 1487 1492 1488 - static inline int scrub_check_fsid(u8 fsid[], 1489 - struct scrub_page *spage) 1493 + static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) 1490 1494 { 1491 - struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices; 1495 + struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices; 1492 1496 int ret; 1493 1497 1494 1498 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); ··· 1499 1507 sblock->checksum_error = 0; 1500 1508 sblock->generation_error = 0; 1501 1509 1502 - if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA) 1510 + if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA) 1503 1511 scrub_checksum_data(sblock); 1504 1512 else 1505 1513 scrub_checksum_tree_block(sblock); ··· 1508 1516 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 1509 1517 struct scrub_block *sblock_good) 1510 1518 { 1511 - int page_num; 1519 + int i; 1512 1520 int ret = 0; 1513 1521 1514 - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1522 + for (i = 0; i < sblock_bad->sector_count; i++) { 1515 1523 int ret_sub; 1516 1524 1517 - ret_sub = scrub_repair_page_from_good_copy(sblock_bad, 1518 - sblock_good, 1519 - page_num, 1); 1525 + ret_sub = scrub_repair_sector_from_good_copy(sblock_bad, 1526 + sblock_good, i, 1); 1520 1527 if (ret_sub) 1521 
1528 ret = ret_sub; 1522 1529 } ··· 1523 1532 return ret; 1524 1533 } 1525 1534 1526 - static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 1527 - struct scrub_block *sblock_good, 1528 - int page_num, int force_write) 1535 + static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, 1536 + struct scrub_block *sblock_good, 1537 + int sector_num, int force_write) 1529 1538 { 1530 - struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; 1531 - struct scrub_page *spage_good = sblock_good->pagev[page_num]; 1539 + struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; 1540 + struct scrub_sector *sector_good = sblock_good->sectors[sector_num]; 1532 1541 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; 1533 1542 const u32 sectorsize = fs_info->sectorsize; 1534 1543 1535 - BUG_ON(spage_bad->page == NULL); 1536 - BUG_ON(spage_good->page == NULL); 1544 + BUG_ON(sector_bad->page == NULL); 1545 + BUG_ON(sector_good->page == NULL); 1537 1546 if (force_write || sblock_bad->header_error || 1538 - sblock_bad->checksum_error || spage_bad->io_error) { 1539 - struct bio *bio; 1547 + sblock_bad->checksum_error || sector_bad->io_error) { 1548 + struct bio bio; 1549 + struct bio_vec bvec; 1540 1550 int ret; 1541 1551 1542 - if (!spage_bad->dev->bdev) { 1552 + if (!sector_bad->dev->bdev) { 1543 1553 btrfs_warn_rl(fs_info, 1544 1554 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); 1545 1555 return -EIO; 1546 1556 } 1547 1557 1548 - bio = btrfs_bio_alloc(1); 1549 - bio_set_dev(bio, spage_bad->dev->bdev); 1550 - bio->bi_iter.bi_sector = spage_bad->physical >> 9; 1551 - bio->bi_opf = REQ_OP_WRITE; 1558 + bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); 1559 + bio.bi_iter.bi_sector = sector_bad->physical >> 9; 1560 + __bio_add_page(&bio, sector_good->page, sectorsize, 0); 1552 1561 1553 - ret = bio_add_page(bio, spage_good->page, sectorsize, 0); 1554 - if (ret != sectorsize) { 1555 - 
bio_put(bio); 1556 - return -EIO; 1557 - } 1562 + btrfsic_check_bio(&bio); 1563 + ret = submit_bio_wait(&bio); 1564 + bio_uninit(&bio); 1558 1565 1559 - if (btrfsic_submit_bio_wait(bio)) { 1560 - btrfs_dev_stat_inc_and_print(spage_bad->dev, 1566 + if (ret) { 1567 + btrfs_dev_stat_inc_and_print(sector_bad->dev, 1561 1568 BTRFS_DEV_STAT_WRITE_ERRS); 1562 1569 atomic64_inc(&fs_info->dev_replace.num_write_errors); 1563 - bio_put(bio); 1564 1570 return -EIO; 1565 1571 } 1566 - bio_put(bio); 1567 1572 } 1568 1573 1569 1574 return 0; ··· 1568 1581 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) 1569 1582 { 1570 1583 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; 1571 - int page_num; 1584 + int i; 1572 1585 1573 1586 /* 1574 1587 * This block is used for the check of the parity on the source device, ··· 1577 1590 if (sblock->sparity) 1578 1591 return; 1579 1592 1580 - for (page_num = 0; page_num < sblock->page_count; page_num++) { 1593 + for (i = 0; i < sblock->sector_count; i++) { 1581 1594 int ret; 1582 1595 1583 - ret = scrub_write_page_to_dev_replace(sblock, page_num); 1596 + ret = scrub_write_sector_to_dev_replace(sblock, i); 1584 1597 if (ret) 1585 1598 atomic64_inc(&fs_info->dev_replace.num_write_errors); 1586 1599 } 1587 1600 } 1588 1601 1589 - static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, 1590 - int page_num) 1602 + static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) 1591 1603 { 1592 - struct scrub_page *spage = sblock->pagev[page_num]; 1604 + struct scrub_sector *sector = sblock->sectors[sector_num]; 1593 1605 1594 - BUG_ON(spage->page == NULL); 1595 - if (spage->io_error) 1596 - clear_page(page_address(spage->page)); 1606 + BUG_ON(sector->page == NULL); 1607 + if (sector->io_error) 1608 + clear_page(page_address(sector->page)); 1597 1609 1598 - return scrub_add_page_to_wr_bio(sblock->sctx, spage); 1610 + return scrub_add_sector_to_wr_bio(sblock->sctx, sector); 1599 
1611 } 1600 1612 1601 1613 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) ··· 1619 1633 return ret; 1620 1634 } 1621 1635 1622 - static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, 1623 - struct scrub_page *spage) 1636 + static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, 1637 + struct scrub_sector *sector) 1624 1638 { 1625 1639 struct scrub_bio *sbio; 1626 1640 int ret; ··· 1636 1650 return -ENOMEM; 1637 1651 } 1638 1652 sctx->wr_curr_bio->sctx = sctx; 1639 - sctx->wr_curr_bio->page_count = 0; 1653 + sctx->wr_curr_bio->sector_count = 0; 1640 1654 } 1641 1655 sbio = sctx->wr_curr_bio; 1642 - if (sbio->page_count == 0) { 1643 - struct bio *bio; 1644 - 1645 - ret = fill_writer_pointer_gap(sctx, 1646 - spage->physical_for_dev_replace); 1656 + if (sbio->sector_count == 0) { 1657 + ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace); 1647 1658 if (ret) { 1648 1659 mutex_unlock(&sctx->wr_lock); 1649 1660 return ret; 1650 1661 } 1651 1662 1652 - sbio->physical = spage->physical_for_dev_replace; 1653 - sbio->logical = spage->logical; 1663 + sbio->physical = sector->physical_for_dev_replace; 1664 + sbio->logical = sector->logical; 1654 1665 sbio->dev = sctx->wr_tgtdev; 1655 - bio = sbio->bio; 1656 - if (!bio) { 1657 - bio = btrfs_bio_alloc(sctx->pages_per_bio); 1658 - sbio->bio = bio; 1666 + if (!sbio->bio) { 1667 + sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, 1668 + REQ_OP_WRITE, GFP_NOFS); 1659 1669 } 1660 - 1661 - bio->bi_private = sbio; 1662 - bio->bi_end_io = scrub_wr_bio_end_io; 1663 - bio_set_dev(bio, sbio->dev->bdev); 1664 - bio->bi_iter.bi_sector = sbio->physical >> 9; 1665 - bio->bi_opf = REQ_OP_WRITE; 1670 + sbio->bio->bi_private = sbio; 1671 + sbio->bio->bi_end_io = scrub_wr_bio_end_io; 1672 + sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; 1666 1673 sbio->status = 0; 1667 - } else if (sbio->physical + sbio->page_count * sectorsize != 1668 - spage->physical_for_dev_replace || 1669 
- sbio->logical + sbio->page_count * sectorsize != 1670 - spage->logical) { 1674 + } else if (sbio->physical + sbio->sector_count * sectorsize != 1675 + sector->physical_for_dev_replace || 1676 + sbio->logical + sbio->sector_count * sectorsize != 1677 + sector->logical) { 1671 1678 scrub_wr_submit(sctx); 1672 1679 goto again; 1673 1680 } 1674 1681 1675 - ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); 1682 + ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); 1676 1683 if (ret != sectorsize) { 1677 - if (sbio->page_count < 1) { 1684 + if (sbio->sector_count < 1) { 1678 1685 bio_put(sbio->bio); 1679 1686 sbio->bio = NULL; 1680 1687 mutex_unlock(&sctx->wr_lock); ··· 1677 1698 goto again; 1678 1699 } 1679 1700 1680 - sbio->pagev[sbio->page_count] = spage; 1681 - scrub_page_get(spage); 1682 - sbio->page_count++; 1683 - if (sbio->page_count == sctx->pages_per_bio) 1701 + sbio->sectors[sbio->sector_count] = sector; 1702 + scrub_sector_get(sector); 1703 + sbio->sector_count++; 1704 + if (sbio->sector_count == sctx->sectors_per_bio) 1684 1705 scrub_wr_submit(sctx); 1685 1706 mutex_unlock(&sctx->wr_lock); 1686 1707 ··· 1696 1717 1697 1718 sbio = sctx->wr_curr_bio; 1698 1719 sctx->wr_curr_bio = NULL; 1699 - WARN_ON(!sbio->bio->bi_bdev); 1700 1720 scrub_pending_bio_inc(sctx); 1701 1721 /* process all writes in a single worker thread. 
Then the block layer 1702 1722 * orders the requests before sending them to the driver which 1703 1723 * doubled the write performance on spinning disks when measured 1704 1724 * with Linux 3.5 */ 1705 - btrfsic_submit_bio(sbio->bio); 1725 + btrfsic_check_bio(sbio->bio); 1726 + submit_bio(sbio->bio); 1706 1727 1707 1728 if (btrfs_is_zoned(sctx->fs_info)) 1708 - sctx->write_pointer = sbio->physical + sbio->page_count * 1729 + sctx->write_pointer = sbio->physical + sbio->sector_count * 1709 1730 sctx->fs_info->sectorsize; 1710 1731 } 1711 1732 ··· 1717 1738 sbio->status = bio->bi_status; 1718 1739 sbio->bio = bio; 1719 1740 1720 - btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); 1721 - btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); 1741 + INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker); 1742 + queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); 1722 1743 } 1723 1744 1724 - static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) 1745 + static void scrub_wr_bio_end_io_worker(struct work_struct *work) 1725 1746 { 1726 1747 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 1727 1748 struct scrub_ctx *sctx = sbio->sctx; 1728 1749 int i; 1729 1750 1730 - ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); 1751 + ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); 1731 1752 if (sbio->status) { 1732 1753 struct btrfs_dev_replace *dev_replace = 1733 1754 &sbio->sctx->fs_info->dev_replace; 1734 1755 1735 - for (i = 0; i < sbio->page_count; i++) { 1736 - struct scrub_page *spage = sbio->pagev[i]; 1756 + for (i = 0; i < sbio->sector_count; i++) { 1757 + struct scrub_sector *sector = sbio->sectors[i]; 1737 1758 1738 - spage->io_error = 1; 1759 + sector->io_error = 1; 1739 1760 atomic64_inc(&dev_replace->num_write_errors); 1740 1761 } 1741 1762 } 1742 1763 1743 - for (i = 0; i < sbio->page_count; i++) 1744 - scrub_page_put(sbio->pagev[i]); 1764 + for (i = 0; i < sbio->sector_count; i++) 1765 + 
scrub_sector_put(sbio->sectors[i]); 1745 1766 1746 1767 bio_put(sbio->bio); 1747 1768 kfree(sbio); ··· 1765 1786 sblock->generation_error = 0; 1766 1787 sblock->checksum_error = 0; 1767 1788 1768 - WARN_ON(sblock->page_count < 1); 1769 - flags = sblock->pagev[0]->flags; 1789 + WARN_ON(sblock->sector_count < 1); 1790 + flags = sblock->sectors[0]->flags; 1770 1791 ret = 0; 1771 1792 if (flags & BTRFS_EXTENT_FLAG_DATA) 1772 1793 ret = scrub_checksum_data(sblock); ··· 1788 1809 struct btrfs_fs_info *fs_info = sctx->fs_info; 1789 1810 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 1790 1811 u8 csum[BTRFS_CSUM_SIZE]; 1791 - struct scrub_page *spage; 1812 + struct scrub_sector *sector; 1792 1813 char *kaddr; 1793 1814 1794 - BUG_ON(sblock->page_count < 1); 1795 - spage = sblock->pagev[0]; 1796 - if (!spage->have_csum) 1815 + BUG_ON(sblock->sector_count < 1); 1816 + sector = sblock->sectors[0]; 1817 + if (!sector->have_csum) 1797 1818 return 0; 1798 1819 1799 - kaddr = page_address(spage->page); 1820 + kaddr = page_address(sector->page); 1800 1821 1801 1822 shash->tfm = fs_info->csum_shash; 1802 1823 crypto_shash_init(shash); 1803 1824 1804 1825 /* 1805 - * In scrub_pages() and scrub_pages_for_parity() we ensure each spage 1826 + * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector 1806 1827 * only contains one sector of data. 
1807 1828 */ 1808 1829 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); 1809 1830 1810 - if (memcmp(csum, spage->csum, fs_info->csum_size)) 1831 + if (memcmp(csum, sector->csum, fs_info->csum_size)) 1811 1832 sblock->checksum_error = 1; 1812 1833 return sblock->checksum_error; 1813 1834 } ··· 1828 1849 const u32 sectorsize = sctx->fs_info->sectorsize; 1829 1850 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; 1830 1851 int i; 1831 - struct scrub_page *spage; 1852 + struct scrub_sector *sector; 1832 1853 char *kaddr; 1833 1854 1834 - BUG_ON(sblock->page_count < 1); 1855 + BUG_ON(sblock->sector_count < 1); 1835 1856 1836 - /* Each member in pagev is just one block, not a full page */ 1837 - ASSERT(sblock->page_count == num_sectors); 1857 + /* Each member in sectors is just one sector */ 1858 + ASSERT(sblock->sector_count == num_sectors); 1838 1859 1839 - spage = sblock->pagev[0]; 1840 - kaddr = page_address(spage->page); 1860 + sector = sblock->sectors[0]; 1861 + kaddr = page_address(sector->page); 1841 1862 h = (struct btrfs_header *)kaddr; 1842 1863 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); 1843 1864 ··· 1846 1867 * a) don't have an extent buffer and 1847 1868 * b) the page is already kmapped 1848 1869 */ 1849 - if (spage->logical != btrfs_stack_header_bytenr(h)) 1870 + if (sector->logical != btrfs_stack_header_bytenr(h)) 1850 1871 sblock->header_error = 1; 1851 1872 1852 - if (spage->generation != btrfs_stack_header_generation(h)) { 1873 + if (sector->generation != btrfs_stack_header_generation(h)) { 1853 1874 sblock->header_error = 1; 1854 1875 sblock->generation_error = 1; 1855 1876 } 1856 1877 1857 - if (!scrub_check_fsid(h->fsid, spage)) 1878 + if (!scrub_check_fsid(h->fsid, sector)) 1858 1879 sblock->header_error = 1; 1859 1880 1860 1881 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, ··· 1867 1888 sectorsize - BTRFS_CSUM_SIZE); 1868 1889 1869 1890 for (i = 1; i < num_sectors; i++) { 1870 - kaddr = 
page_address(sblock->pagev[i]->page); 1891 + kaddr = page_address(sblock->sectors[i]->page); 1871 1892 crypto_shash_update(shash, kaddr, sectorsize); 1872 1893 } 1873 1894 ··· 1885 1906 struct btrfs_fs_info *fs_info = sctx->fs_info; 1886 1907 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 1887 1908 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1888 - struct scrub_page *spage; 1909 + struct scrub_sector *sector; 1889 1910 char *kaddr; 1890 1911 int fail_gen = 0; 1891 1912 int fail_cor = 0; 1892 1913 1893 - BUG_ON(sblock->page_count < 1); 1894 - spage = sblock->pagev[0]; 1895 - kaddr = page_address(spage->page); 1914 + BUG_ON(sblock->sector_count < 1); 1915 + sector = sblock->sectors[0]; 1916 + kaddr = page_address(sector->page); 1896 1917 s = (struct btrfs_super_block *)kaddr; 1897 1918 1898 - if (spage->logical != btrfs_super_bytenr(s)) 1919 + if (sector->logical != btrfs_super_bytenr(s)) 1899 1920 ++fail_cor; 1900 1921 1901 - if (spage->generation != btrfs_super_generation(s)) 1922 + if (sector->generation != btrfs_super_generation(s)) 1902 1923 ++fail_gen; 1903 1924 1904 - if (!scrub_check_fsid(s->fsid, spage)) 1925 + if (!scrub_check_fsid(s->fsid, sector)) 1905 1926 ++fail_cor; 1906 1927 1907 1928 shash->tfm = fs_info->csum_shash; ··· 1922 1943 ++sctx->stat.super_errors; 1923 1944 spin_unlock(&sctx->stat_lock); 1924 1945 if (fail_cor) 1925 - btrfs_dev_stat_inc_and_print(spage->dev, 1946 + btrfs_dev_stat_inc_and_print(sector->dev, 1926 1947 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1927 1948 else 1928 - btrfs_dev_stat_inc_and_print(spage->dev, 1949 + btrfs_dev_stat_inc_and_print(sector->dev, 1929 1950 BTRFS_DEV_STAT_GENERATION_ERRS); 1930 1951 } 1931 1952 ··· 1945 1966 if (sblock->sparity) 1946 1967 scrub_parity_put(sblock->sparity); 1947 1968 1948 - for (i = 0; i < sblock->page_count; i++) 1949 - scrub_page_put(sblock->pagev[i]); 1969 + for (i = 0; i < sblock->sector_count; i++) 1970 + scrub_sector_put(sblock->sectors[i]); 1950 1971 kfree(sblock); 1951 1972 } 1952 1973 } 1953 
1974 1954 - static void scrub_page_get(struct scrub_page *spage) 1975 + static void scrub_sector_get(struct scrub_sector *sector) 1955 1976 { 1956 - atomic_inc(&spage->refs); 1977 + atomic_inc(&sector->refs); 1957 1978 } 1958 1979 1959 - static void scrub_page_put(struct scrub_page *spage) 1980 + static void scrub_sector_put(struct scrub_sector *sector) 1960 1981 { 1961 - if (atomic_dec_and_test(&spage->refs)) { 1962 - if (spage->page) 1963 - __free_page(spage->page); 1964 - kfree(spage); 1982 + if (atomic_dec_and_test(&sector->refs)) { 1983 + if (sector->page) 1984 + __free_page(sector->page); 1985 + kfree(sector); 1965 1986 } 1966 1987 } 1967 1988 ··· 2036 2057 sbio = sctx->bios[sctx->curr]; 2037 2058 sctx->curr = -1; 2038 2059 scrub_pending_bio_inc(sctx); 2039 - btrfsic_submit_bio(sbio->bio); 2060 + btrfsic_check_bio(sbio->bio); 2061 + submit_bio(sbio->bio); 2040 2062 } 2041 2063 2042 - static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 2043 - struct scrub_page *spage) 2064 + static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx, 2065 + struct scrub_sector *sector) 2044 2066 { 2045 - struct scrub_block *sblock = spage->sblock; 2067 + struct scrub_block *sblock = sector->sblock; 2046 2068 struct scrub_bio *sbio; 2047 2069 const u32 sectorsize = sctx->fs_info->sectorsize; 2048 2070 int ret; ··· 2058 2078 if (sctx->curr != -1) { 2059 2079 sctx->first_free = sctx->bios[sctx->curr]->next_free; 2060 2080 sctx->bios[sctx->curr]->next_free = -1; 2061 - sctx->bios[sctx->curr]->page_count = 0; 2081 + sctx->bios[sctx->curr]->sector_count = 0; 2062 2082 spin_unlock(&sctx->list_lock); 2063 2083 } else { 2064 2084 spin_unlock(&sctx->list_lock); ··· 2066 2086 } 2067 2087 } 2068 2088 sbio = sctx->bios[sctx->curr]; 2069 - if (sbio->page_count == 0) { 2070 - struct bio *bio; 2071 - 2072 - sbio->physical = spage->physical; 2073 - sbio->logical = spage->logical; 2074 - sbio->dev = spage->dev; 2075 - bio = sbio->bio; 2076 - if (!bio) { 2077 - bio = 
btrfs_bio_alloc(sctx->pages_per_bio); 2078 - sbio->bio = bio; 2089 + if (sbio->sector_count == 0) { 2090 + sbio->physical = sector->physical; 2091 + sbio->logical = sector->logical; 2092 + sbio->dev = sector->dev; 2093 + if (!sbio->bio) { 2094 + sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, 2095 + REQ_OP_READ, GFP_NOFS); 2079 2096 } 2080 - 2081 - bio->bi_private = sbio; 2082 - bio->bi_end_io = scrub_bio_end_io; 2083 - bio_set_dev(bio, sbio->dev->bdev); 2084 - bio->bi_iter.bi_sector = sbio->physical >> 9; 2085 - bio->bi_opf = REQ_OP_READ; 2097 + sbio->bio->bi_private = sbio; 2098 + sbio->bio->bi_end_io = scrub_bio_end_io; 2099 + sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; 2086 2100 sbio->status = 0; 2087 - } else if (sbio->physical + sbio->page_count * sectorsize != 2088 - spage->physical || 2089 - sbio->logical + sbio->page_count * sectorsize != 2090 - spage->logical || 2091 - sbio->dev != spage->dev) { 2101 + } else if (sbio->physical + sbio->sector_count * sectorsize != 2102 + sector->physical || 2103 + sbio->logical + sbio->sector_count * sectorsize != 2104 + sector->logical || 2105 + sbio->dev != sector->dev) { 2092 2106 scrub_submit(sctx); 2093 2107 goto again; 2094 2108 } 2095 2109 2096 - sbio->pagev[sbio->page_count] = spage; 2097 - ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); 2110 + sbio->sectors[sbio->sector_count] = sector; 2111 + ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); 2098 2112 if (ret != sectorsize) { 2099 - if (sbio->page_count < 1) { 2113 + if (sbio->sector_count < 1) { 2100 2114 bio_put(sbio->bio); 2101 2115 sbio->bio = NULL; 2102 2116 return -EIO; ··· 2100 2126 } 2101 2127 2102 2128 scrub_block_get(sblock); /* one for the page added to the bio */ 2103 - atomic_inc(&sblock->outstanding_pages); 2104 - sbio->page_count++; 2105 - if (sbio->page_count == sctx->pages_per_bio) 2129 + atomic_inc(&sblock->outstanding_sectors); 2130 + sbio->sector_count++; 2131 + if (sbio->sector_count == 
sctx->sectors_per_bio) 2106 2132 scrub_submit(sctx); 2107 2133 2108 2134 return 0; ··· 2118 2144 2119 2145 bio_put(bio); 2120 2146 2121 - btrfs_queue_work(fs_info->scrub_workers, &sblock->work); 2147 + queue_work(fs_info->scrub_workers, &sblock->work); 2122 2148 } 2123 2149 2124 - static void scrub_missing_raid56_worker(struct btrfs_work *work) 2150 + static void scrub_missing_raid56_worker(struct work_struct *work) 2125 2151 { 2126 2152 struct scrub_block *sblock = container_of(work, struct scrub_block, work); 2127 2153 struct scrub_ctx *sctx = sblock->sctx; ··· 2129 2155 u64 logical; 2130 2156 struct btrfs_device *dev; 2131 2157 2132 - logical = sblock->pagev[0]->logical; 2133 - dev = sblock->pagev[0]->dev; 2158 + logical = sblock->sectors[0]->logical; 2159 + dev = sblock->sectors[0]->dev; 2134 2160 2135 2161 if (sblock->no_io_error_seen) 2136 2162 scrub_recheck_block_checksum(sblock); ··· 2167 2193 { 2168 2194 struct scrub_ctx *sctx = sblock->sctx; 2169 2195 struct btrfs_fs_info *fs_info = sctx->fs_info; 2170 - u64 length = sblock->page_count * PAGE_SIZE; 2171 - u64 logical = sblock->pagev[0]->logical; 2196 + u64 length = sblock->sector_count << fs_info->sectorsize_bits; 2197 + u64 logical = sblock->sectors[0]->logical; 2172 2198 struct btrfs_io_context *bioc = NULL; 2173 2199 struct bio *bio; 2174 2200 struct btrfs_raid_bio *rbio; ··· 2187 2213 * We shouldn't be scrubbing a missing device. Even for dev 2188 2214 * replace, we should only get here for RAID 5/6. We either 2189 2215 * managed to mount something with no mirrors remaining or 2190 - * there's a bug in scrub_remap_extent()/btrfs_map_block(). 2216 + * there's a bug in scrub_find_good_copy()/btrfs_map_block(). 
2191 2217 */ 2192 2218 goto bioc_out; 2193 2219 } 2194 2220 2195 - bio = btrfs_bio_alloc(BIO_MAX_VECS); 2221 + bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); 2196 2222 bio->bi_iter.bi_sector = logical >> 9; 2197 2223 bio->bi_private = sblock; 2198 2224 bio->bi_end_io = scrub_missing_raid56_end_io; ··· 2201 2227 if (!rbio) 2202 2228 goto rbio_out; 2203 2229 2204 - for (i = 0; i < sblock->page_count; i++) { 2205 - struct scrub_page *spage = sblock->pagev[i]; 2230 + for (i = 0; i < sblock->sector_count; i++) { 2231 + struct scrub_sector *sector = sblock->sectors[i]; 2206 2232 2207 - raid56_add_scrub_pages(rbio, spage->page, spage->logical); 2233 + /* 2234 + * For now, our scrub is still one page per sector, so pgoff 2235 + * is always 0. 2236 + */ 2237 + raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical); 2208 2238 } 2209 2239 2210 - btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL); 2240 + INIT_WORK(&sblock->work, scrub_missing_raid56_worker); 2211 2241 scrub_block_get(sblock); 2212 2242 scrub_pending_bio_inc(sctx); 2213 2243 raid56_submit_missing_rbio(rbio); ··· 2227 2249 spin_unlock(&sctx->stat_lock); 2228 2250 } 2229 2251 2230 - static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, 2252 + static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, 2231 2253 u64 physical, struct btrfs_device *dev, u64 flags, 2232 2254 u64 gen, int mirror_num, u8 *csum, 2233 2255 u64 physical_for_dev_replace) ··· 2251 2273 sblock->no_io_error_seen = 1; 2252 2274 2253 2275 for (index = 0; len > 0; index++) { 2254 - struct scrub_page *spage; 2276 + struct scrub_sector *sector; 2255 2277 /* 2256 2278 * Here we will allocate one page for one sector to scrub. 
2257 2279 * This is fine if PAGE_SIZE == sectorsize, but will cost ··· 2259 2281 */ 2260 2282 u32 l = min(sectorsize, len); 2261 2283 2262 - spage = kzalloc(sizeof(*spage), GFP_KERNEL); 2263 - if (!spage) { 2284 + sector = kzalloc(sizeof(*sector), GFP_KERNEL); 2285 + if (!sector) { 2264 2286 leave_nomem: 2265 2287 spin_lock(&sctx->stat_lock); 2266 2288 sctx->stat.malloc_errors++; ··· 2268 2290 scrub_block_put(sblock); 2269 2291 return -ENOMEM; 2270 2292 } 2271 - ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); 2272 - scrub_page_get(spage); 2273 - sblock->pagev[index] = spage; 2274 - spage->sblock = sblock; 2275 - spage->dev = dev; 2276 - spage->flags = flags; 2277 - spage->generation = gen; 2278 - spage->logical = logical; 2279 - spage->physical = physical; 2280 - spage->physical_for_dev_replace = physical_for_dev_replace; 2281 - spage->mirror_num = mirror_num; 2293 + ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); 2294 + scrub_sector_get(sector); 2295 + sblock->sectors[index] = sector; 2296 + sector->sblock = sblock; 2297 + sector->dev = dev; 2298 + sector->flags = flags; 2299 + sector->generation = gen; 2300 + sector->logical = logical; 2301 + sector->physical = physical; 2302 + sector->physical_for_dev_replace = physical_for_dev_replace; 2303 + sector->mirror_num = mirror_num; 2282 2304 if (csum) { 2283 - spage->have_csum = 1; 2284 - memcpy(spage->csum, csum, sctx->fs_info->csum_size); 2305 + sector->have_csum = 1; 2306 + memcpy(sector->csum, csum, sctx->fs_info->csum_size); 2285 2307 } else { 2286 - spage->have_csum = 0; 2308 + sector->have_csum = 0; 2287 2309 } 2288 - sblock->page_count++; 2289 - spage->page = alloc_page(GFP_KERNEL); 2290 - if (!spage->page) 2310 + sblock->sector_count++; 2311 + sector->page = alloc_page(GFP_KERNEL); 2312 + if (!sector->page) 2291 2313 goto leave_nomem; 2292 2314 len -= l; 2293 2315 logical += l; ··· 2295 2317 physical_for_dev_replace += l; 2296 2318 } 2297 2319 2298 - WARN_ON(sblock->page_count == 0); 2320 + 
WARN_ON(sblock->sector_count == 0); 2299 2321 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { 2300 2322 /* 2301 2323 * This case should only be hit for RAID 5/6 device replace. See ··· 2303 2325 */ 2304 2326 scrub_missing_raid56_pages(sblock); 2305 2327 } else { 2306 - for (index = 0; index < sblock->page_count; index++) { 2307 - struct scrub_page *spage = sblock->pagev[index]; 2328 + for (index = 0; index < sblock->sector_count; index++) { 2329 + struct scrub_sector *sector = sblock->sectors[index]; 2308 2330 int ret; 2309 2331 2310 - ret = scrub_add_page_to_rd_bio(sctx, spage); 2332 + ret = scrub_add_sector_to_rd_bio(sctx, sector); 2311 2333 if (ret) { 2312 2334 scrub_block_put(sblock); 2313 2335 return ret; ··· 2331 2353 sbio->status = bio->bi_status; 2332 2354 sbio->bio = bio; 2333 2355 2334 - btrfs_queue_work(fs_info->scrub_workers, &sbio->work); 2356 + queue_work(fs_info->scrub_workers, &sbio->work); 2335 2357 } 2336 2358 2337 - static void scrub_bio_end_io_worker(struct btrfs_work *work) 2359 + static void scrub_bio_end_io_worker(struct work_struct *work) 2338 2360 { 2339 2361 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2340 2362 struct scrub_ctx *sctx = sbio->sctx; 2341 2363 int i; 2342 2364 2343 - ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); 2365 + ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); 2344 2366 if (sbio->status) { 2345 - for (i = 0; i < sbio->page_count; i++) { 2346 - struct scrub_page *spage = sbio->pagev[i]; 2367 + for (i = 0; i < sbio->sector_count; i++) { 2368 + struct scrub_sector *sector = sbio->sectors[i]; 2347 2369 2348 - spage->io_error = 1; 2349 - spage->sblock->no_io_error_seen = 0; 2370 + sector->io_error = 1; 2371 + sector->sblock->no_io_error_seen = 0; 2350 2372 } 2351 2373 } 2352 2374 2353 - /* now complete the scrub_block items that have all pages completed */ 2354 - for (i = 0; i < sbio->page_count; i++) { 2355 - struct scrub_page *spage = sbio->pagev[i]; 2356 - struct scrub_block 
*sblock = spage->sblock; 2375 + /* Now complete the scrub_block items that have all pages completed */ 2376 + for (i = 0; i < sbio->sector_count; i++) { 2377 + struct scrub_sector *sector = sbio->sectors[i]; 2378 + struct scrub_block *sblock = sector->sblock; 2357 2379 2358 - if (atomic_dec_and_test(&sblock->outstanding_pages)) 2380 + if (atomic_dec_and_test(&sblock->outstanding_sectors)) 2359 2381 scrub_block_complete(sblock); 2360 2382 scrub_block_put(sblock); 2361 2383 } ··· 2434 2456 } 2435 2457 2436 2458 if (sblock->sparity && corrupted && !sblock->data_corrected) { 2437 - u64 start = sblock->pagev[0]->logical; 2438 - u64 end = sblock->pagev[sblock->page_count - 1]->logical + 2459 + u64 start = sblock->sectors[0]->logical; 2460 + u64 end = sblock->sectors[sblock->sector_count - 1]->logical + 2439 2461 sblock->sctx->fs_info->sectorsize; 2440 2462 2441 2463 ASSERT(end - start <= U32_MAX); ··· 2510 2532 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, 2511 2533 u64 logical, u32 len, 2512 2534 u64 physical, struct btrfs_device *dev, u64 flags, 2513 - u64 gen, int mirror_num, u64 physical_for_dev_replace) 2535 + u64 gen, int mirror_num) 2514 2536 { 2537 + struct btrfs_device *src_dev = dev; 2538 + u64 src_physical = physical; 2539 + int src_mirror = mirror_num; 2515 2540 int ret; 2516 2541 u8 csum[BTRFS_CSUM_SIZE]; 2517 2542 u32 blocksize; ··· 2542 2561 WARN_ON(1); 2543 2562 } 2544 2563 2564 + /* 2565 + * For dev-replace case, we can have @dev being a missing device. 2566 + * Regular scrub will avoid its execution on missing device at all, 2567 + * as that would trigger tons of read error. 2568 + * 2569 + * Reading from missing device will cause read error counts to 2570 + * increase unnecessarily. 2571 + * So here we change the read source to a good mirror. 
2572 + */ 2573 + if (sctx->is_dev_replace && !dev->bdev) 2574 + scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical, 2575 + &src_dev, &src_mirror); 2545 2576 while (len) { 2546 2577 u32 l = min(len, blocksize); 2547 2578 int have_csum = 0; ··· 2564 2571 if (have_csum == 0) 2565 2572 ++sctx->stat.no_csum; 2566 2573 } 2567 - ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, 2568 - mirror_num, have_csum ? csum : NULL, 2569 - physical_for_dev_replace); 2574 + ret = scrub_sectors(sctx, logical, l, src_physical, src_dev, 2575 + flags, gen, src_mirror, 2576 + have_csum ? csum : NULL, physical); 2570 2577 if (ret) 2571 2578 return ret; 2572 2579 len -= l; 2573 2580 logical += l; 2574 2581 physical += l; 2575 - physical_for_dev_replace += l; 2582 + src_physical += l; 2576 2583 } 2577 2584 return 0; 2578 2585 } 2579 2586 2580 - static int scrub_pages_for_parity(struct scrub_parity *sparity, 2587 + static int scrub_sectors_for_parity(struct scrub_parity *sparity, 2581 2588 u64 logical, u32 len, 2582 2589 u64 physical, struct btrfs_device *dev, 2583 2590 u64 flags, u64 gen, int mirror_num, u8 *csum) ··· 2606 2613 scrub_parity_get(sparity); 2607 2614 2608 2615 for (index = 0; len > 0; index++) { 2609 - struct scrub_page *spage; 2616 + struct scrub_sector *sector; 2610 2617 2611 - spage = kzalloc(sizeof(*spage), GFP_KERNEL); 2612 - if (!spage) { 2618 + sector = kzalloc(sizeof(*sector), GFP_KERNEL); 2619 + if (!sector) { 2613 2620 leave_nomem: 2614 2621 spin_lock(&sctx->stat_lock); 2615 2622 sctx->stat.malloc_errors++; ··· 2617 2624 scrub_block_put(sblock); 2618 2625 return -ENOMEM; 2619 2626 } 2620 - ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); 2627 + ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); 2621 2628 /* For scrub block */ 2622 - scrub_page_get(spage); 2623 - sblock->pagev[index] = spage; 2629 + scrub_sector_get(sector); 2630 + sblock->sectors[index] = sector; 2624 2631 /* For scrub parity */ 2625 - scrub_page_get(spage); 2626 - 
list_add_tail(&spage->list, &sparity->spages); 2627 - spage->sblock = sblock; 2628 - spage->dev = dev; 2629 - spage->flags = flags; 2630 - spage->generation = gen; 2631 - spage->logical = logical; 2632 - spage->physical = physical; 2633 - spage->mirror_num = mirror_num; 2632 + scrub_sector_get(sector); 2633 + list_add_tail(&sector->list, &sparity->sectors_list); 2634 + sector->sblock = sblock; 2635 + sector->dev = dev; 2636 + sector->flags = flags; 2637 + sector->generation = gen; 2638 + sector->logical = logical; 2639 + sector->physical = physical; 2640 + sector->mirror_num = mirror_num; 2634 2641 if (csum) { 2635 - spage->have_csum = 1; 2636 - memcpy(spage->csum, csum, sctx->fs_info->csum_size); 2642 + sector->have_csum = 1; 2643 + memcpy(sector->csum, csum, sctx->fs_info->csum_size); 2637 2644 } else { 2638 - spage->have_csum = 0; 2645 + sector->have_csum = 0; 2639 2646 } 2640 - sblock->page_count++; 2641 - spage->page = alloc_page(GFP_KERNEL); 2642 - if (!spage->page) 2647 + sblock->sector_count++; 2648 + sector->page = alloc_page(GFP_KERNEL); 2649 + if (!sector->page) 2643 2650 goto leave_nomem; 2644 2651 2645 2652 ··· 2649 2656 physical += sectorsize; 2650 2657 } 2651 2658 2652 - WARN_ON(sblock->page_count == 0); 2653 - for (index = 0; index < sblock->page_count; index++) { 2654 - struct scrub_page *spage = sblock->pagev[index]; 2659 + WARN_ON(sblock->sector_count == 0); 2660 + for (index = 0; index < sblock->sector_count; index++) { 2661 + struct scrub_sector *sector = sblock->sectors[index]; 2655 2662 int ret; 2656 2663 2657 - ret = scrub_add_page_to_rd_bio(sctx, spage); 2664 + ret = scrub_add_sector_to_rd_bio(sctx, sector); 2658 2665 if (ret) { 2659 2666 scrub_block_put(sblock); 2660 2667 return ret; 2661 2668 } 2662 2669 } 2663 2670 2664 - /* last one frees, either here or in bio completion for last page */ 2671 + /* Last one frees, either here or in bio completion for last sector */ 2665 2672 scrub_block_put(sblock); 2666 2673 return 0; 2667 2674 } ··· 
2700 2707 if (have_csum == 0) 2701 2708 goto skip; 2702 2709 } 2703 - ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, 2710 + ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev, 2704 2711 flags, gen, mirror_num, 2705 2712 have_csum ? csum : NULL); 2706 2713 if (ret) ··· 2760 2767 static void scrub_free_parity(struct scrub_parity *sparity) 2761 2768 { 2762 2769 struct scrub_ctx *sctx = sparity->sctx; 2763 - struct scrub_page *curr, *next; 2770 + struct scrub_sector *curr, *next; 2764 2771 int nbits; 2765 2772 2766 2773 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); ··· 2771 2778 spin_unlock(&sctx->stat_lock); 2772 2779 } 2773 2780 2774 - list_for_each_entry_safe(curr, next, &sparity->spages, list) { 2781 + list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) { 2775 2782 list_del_init(&curr->list); 2776 - scrub_page_put(curr); 2783 + scrub_sector_put(curr); 2777 2784 } 2778 2785 2779 2786 kfree(sparity); 2780 2787 } 2781 2788 2782 - static void scrub_parity_bio_endio_worker(struct btrfs_work *work) 2789 + static void scrub_parity_bio_endio_worker(struct work_struct *work) 2783 2790 { 2784 2791 struct scrub_parity *sparity = container_of(work, struct scrub_parity, 2785 2792 work); ··· 2791 2798 2792 2799 static void scrub_parity_bio_endio(struct bio *bio) 2793 2800 { 2794 - struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; 2801 + struct scrub_parity *sparity = bio->bi_private; 2795 2802 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; 2796 2803 2797 2804 if (bio->bi_status) ··· 2800 2807 2801 2808 bio_put(bio); 2802 2809 2803 - btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL, 2804 - NULL); 2805 - btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work); 2810 + INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker); 2811 + queue_work(fs_info->scrub_parity_workers, &sparity->work); 2806 2812 } 2807 2813 2808 2814 static void 
scrub_parity_check_and_repair(struct scrub_parity *sparity) ··· 2826 2834 if (ret || !bioc || !bioc->raid_map) 2827 2835 goto bioc_out; 2828 2836 2829 - bio = btrfs_bio_alloc(BIO_MAX_VECS); 2837 + bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); 2830 2838 bio->bi_iter.bi_sector = sparity->logic_start >> 9; 2831 2839 bio->bi_private = sparity; 2832 2840 bio->bi_end_io = scrub_parity_bio_endio; ··· 2874 2882 scrub_parity_check_and_repair(sparity); 2875 2883 } 2876 2884 2885 + /* 2886 + * Return 0 if the extent item range covers any byte of the range. 2887 + * Return <0 if the extent item is before @search_start. 2888 + * Return >0 if the extent item is after @start_start + @search_len. 2889 + */ 2890 + static int compare_extent_item_range(struct btrfs_path *path, 2891 + u64 search_start, u64 search_len) 2892 + { 2893 + struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info; 2894 + u64 len; 2895 + struct btrfs_key key; 2896 + 2897 + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2898 + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || 2899 + key.type == BTRFS_METADATA_ITEM_KEY); 2900 + if (key.type == BTRFS_METADATA_ITEM_KEY) 2901 + len = fs_info->nodesize; 2902 + else 2903 + len = key.offset; 2904 + 2905 + if (key.objectid + len <= search_start) 2906 + return -1; 2907 + if (key.objectid >= search_start + search_len) 2908 + return 1; 2909 + return 0; 2910 + } 2911 + 2912 + /* 2913 + * Locate one extent item which covers any byte in range 2914 + * [@search_start, @search_start + @search_length) 2915 + * 2916 + * If the path is not initialized, we will initialize the search by doing 2917 + * a btrfs_search_slot(). 2918 + * If the path is already initialized, we will use the path as the initial 2919 + * slot, to avoid duplicated btrfs_search_slot() calls. 2920 + * 2921 + * NOTE: If an extent item starts before @search_start, we will still 2922 + * return the extent item. This is for data extent crossing stripe boundary. 
2923 + * 2924 + * Return 0 if we found such extent item, and @path will point to the extent item. 2925 + * Return >0 if no such extent item can be found, and @path will be released. 2926 + * Return <0 if hit fatal error, and @path will be released. 2927 + */ 2928 + static int find_first_extent_item(struct btrfs_root *extent_root, 2929 + struct btrfs_path *path, 2930 + u64 search_start, u64 search_len) 2931 + { 2932 + struct btrfs_fs_info *fs_info = extent_root->fs_info; 2933 + struct btrfs_key key; 2934 + int ret; 2935 + 2936 + /* Continue using the existing path */ 2937 + if (path->nodes[0]) 2938 + goto search_forward; 2939 + 2940 + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2941 + key.type = BTRFS_METADATA_ITEM_KEY; 2942 + else 2943 + key.type = BTRFS_EXTENT_ITEM_KEY; 2944 + key.objectid = search_start; 2945 + key.offset = (u64)-1; 2946 + 2947 + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2948 + if (ret < 0) 2949 + return ret; 2950 + 2951 + ASSERT(ret > 0); 2952 + /* 2953 + * Here we intentionally pass 0 as @min_objectid, as there could be 2954 + * an extent item starting before @search_start. 2955 + */ 2956 + ret = btrfs_previous_extent_item(extent_root, path, 0); 2957 + if (ret < 0) 2958 + return ret; 2959 + /* 2960 + * No matter whether we have found an extent item, the next loop will 2961 + * properly do every check on the key. 
2962 + */ 2963 + search_forward: 2964 + while (true) { 2965 + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2966 + if (key.objectid >= search_start + search_len) 2967 + break; 2968 + if (key.type != BTRFS_METADATA_ITEM_KEY && 2969 + key.type != BTRFS_EXTENT_ITEM_KEY) 2970 + goto next; 2971 + 2972 + ret = compare_extent_item_range(path, search_start, search_len); 2973 + if (ret == 0) 2974 + return ret; 2975 + if (ret > 0) 2976 + break; 2977 + next: 2978 + path->slots[0]++; 2979 + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2980 + ret = btrfs_next_leaf(extent_root, path); 2981 + if (ret) { 2982 + /* Either no more item or fatal error */ 2983 + btrfs_release_path(path); 2984 + return ret; 2985 + } 2986 + } 2987 + } 2988 + btrfs_release_path(path); 2989 + return 1; 2990 + } 2991 + 2992 + static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, 2993 + u64 *size_ret, u64 *flags_ret, u64 *generation_ret) 2994 + { 2995 + struct btrfs_key key; 2996 + struct btrfs_extent_item *ei; 2997 + 2998 + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2999 + ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || 3000 + key.type == BTRFS_EXTENT_ITEM_KEY); 3001 + *extent_start_ret = key.objectid; 3002 + if (key.type == BTRFS_METADATA_ITEM_KEY) 3003 + *size_ret = path->nodes[0]->fs_info->nodesize; 3004 + else 3005 + *size_ret = key.offset; 3006 + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); 3007 + *flags_ret = btrfs_extent_flags(path->nodes[0], ei); 3008 + *generation_ret = btrfs_extent_generation(path->nodes[0], ei); 3009 + } 3010 + 3011 + static bool does_range_cross_boundary(u64 extent_start, u64 extent_len, 3012 + u64 boundary_start, u64 boudary_len) 3013 + { 3014 + return (extent_start < boundary_start && 3015 + extent_start + extent_len > boundary_start) || 3016 + (extent_start < boundary_start + boudary_len && 3017 + extent_start + extent_len > boundary_start + boudary_len); 3018 + } 3019 + 
3020 + static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, 3021 + struct scrub_parity *sparity, 3022 + struct map_lookup *map, 3023 + struct btrfs_device *sdev, 3024 + struct btrfs_path *path, 3025 + u64 logical) 3026 + { 3027 + struct btrfs_fs_info *fs_info = sctx->fs_info; 3028 + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); 3029 + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical); 3030 + u64 cur_logical = logical; 3031 + int ret; 3032 + 3033 + ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); 3034 + 3035 + /* Path must not be populated */ 3036 + ASSERT(!path->nodes[0]); 3037 + 3038 + while (cur_logical < logical + map->stripe_len) { 3039 + struct btrfs_io_context *bioc = NULL; 3040 + struct btrfs_device *extent_dev; 3041 + u64 extent_start; 3042 + u64 extent_size; 3043 + u64 mapped_length; 3044 + u64 extent_flags; 3045 + u64 extent_gen; 3046 + u64 extent_physical; 3047 + u64 extent_mirror_num; 3048 + 3049 + ret = find_first_extent_item(extent_root, path, cur_logical, 3050 + logical + map->stripe_len - cur_logical); 3051 + /* No more extent item in this data stripe */ 3052 + if (ret > 0) { 3053 + ret = 0; 3054 + break; 3055 + } 3056 + if (ret < 0) 3057 + break; 3058 + get_extent_info(path, &extent_start, &extent_size, &extent_flags, 3059 + &extent_gen); 3060 + 3061 + /* Metadata should not cross stripe boundaries */ 3062 + if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && 3063 + does_range_cross_boundary(extent_start, extent_size, 3064 + logical, map->stripe_len)) { 3065 + btrfs_err(fs_info, 3066 + "scrub: tree block %llu spanning stripes, ignored. 
logical=%llu", 3067 + extent_start, logical); 3068 + spin_lock(&sctx->stat_lock); 3069 + sctx->stat.uncorrectable_errors++; 3070 + spin_unlock(&sctx->stat_lock); 3071 + cur_logical += extent_size; 3072 + continue; 3073 + } 3074 + 3075 + /* Skip hole range which doesn't have any extent */ 3076 + cur_logical = max(extent_start, cur_logical); 3077 + 3078 + /* Truncate the range inside this data stripe */ 3079 + extent_size = min(extent_start + extent_size, 3080 + logical + map->stripe_len) - cur_logical; 3081 + extent_start = cur_logical; 3082 + ASSERT(extent_size <= U32_MAX); 3083 + 3084 + scrub_parity_mark_sectors_data(sparity, extent_start, extent_size); 3085 + 3086 + mapped_length = extent_size; 3087 + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start, 3088 + &mapped_length, &bioc, 0); 3089 + if (!ret && (!bioc || mapped_length < extent_size)) 3090 + ret = -EIO; 3091 + if (ret) { 3092 + btrfs_put_bioc(bioc); 3093 + scrub_parity_mark_sectors_error(sparity, extent_start, 3094 + extent_size); 3095 + break; 3096 + } 3097 + extent_physical = bioc->stripes[0].physical; 3098 + extent_mirror_num = bioc->mirror_num; 3099 + extent_dev = bioc->stripes[0].dev; 3100 + btrfs_put_bioc(bioc); 3101 + 3102 + ret = btrfs_lookup_csums_range(csum_root, extent_start, 3103 + extent_start + extent_size - 1, 3104 + &sctx->csum_list, 1); 3105 + if (ret) { 3106 + scrub_parity_mark_sectors_error(sparity, extent_start, 3107 + extent_size); 3108 + break; 3109 + } 3110 + 3111 + ret = scrub_extent_for_parity(sparity, extent_start, 3112 + extent_size, extent_physical, 3113 + extent_dev, extent_flags, 3114 + extent_gen, extent_mirror_num); 3115 + scrub_free_csums(sctx); 3116 + 3117 + if (ret) { 3118 + scrub_parity_mark_sectors_error(sparity, extent_start, 3119 + extent_size); 3120 + break; 3121 + } 3122 + 3123 + cond_resched(); 3124 + cur_logical += extent_size; 3125 + } 3126 + btrfs_release_path(path); 3127 + return ret; 3128 + } 3129 + 2877 3130 static noinline_for_stack int 
scrub_raid56_parity(struct scrub_ctx *sctx, 2878 3131 struct map_lookup *map, 2879 3132 struct btrfs_device *sdev, ··· 3126 2889 u64 logic_end) 3127 2890 { 3128 2891 struct btrfs_fs_info *fs_info = sctx->fs_info; 3129 - struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start); 3130 - struct btrfs_root *csum_root; 3131 - struct btrfs_extent_item *extent; 3132 - struct btrfs_io_context *bioc = NULL; 3133 2892 struct btrfs_path *path; 3134 - u64 flags; 2893 + u64 cur_logical; 3135 2894 int ret; 3136 - int slot; 3137 - struct extent_buffer *l; 3138 - struct btrfs_key key; 3139 - u64 generation; 3140 - u64 extent_logical; 3141 - u64 extent_physical; 3142 - /* Check the comment in scrub_stripe() for why u32 is enough here */ 3143 - u32 extent_len; 3144 - u64 mapped_length; 3145 - struct btrfs_device *extent_dev; 3146 2895 struct scrub_parity *sparity; 3147 2896 int nsectors; 3148 2897 int bitmap_len; 3149 - int extent_mirror_num; 3150 - int stop_loop = 0; 3151 2898 3152 2899 path = btrfs_alloc_path(); 3153 2900 if (!path) { ··· 3164 2943 sparity->logic_start = logic_start; 3165 2944 sparity->logic_end = logic_end; 3166 2945 refcount_set(&sparity->refs, 1); 3167 - INIT_LIST_HEAD(&sparity->spages); 2946 + INIT_LIST_HEAD(&sparity->sectors_list); 3168 2947 sparity->dbitmap = sparity->bitmap; 3169 2948 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; 3170 2949 3171 2950 ret = 0; 3172 - while (logic_start < logic_end) { 3173 - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 3174 - key.type = BTRFS_METADATA_ITEM_KEY; 3175 - else 3176 - key.type = BTRFS_EXTENT_ITEM_KEY; 3177 - key.objectid = logic_start; 3178 - key.offset = (u64)-1; 3179 - 3180 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2951 + for (cur_logical = logic_start; cur_logical < logic_end; 2952 + cur_logical += map->stripe_len) { 2953 + ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map, 2954 + sdev, path, cur_logical); 3181 2955 if (ret < 0) 3182 - goto out; 3183 - 3184 - if 
(ret > 0) { 3185 - ret = btrfs_previous_extent_item(root, path, 0); 3186 - if (ret < 0) 3187 - goto out; 3188 - if (ret > 0) { 3189 - btrfs_release_path(path); 3190 - ret = btrfs_search_slot(NULL, root, &key, 3191 - path, 0, 0); 3192 - if (ret < 0) 3193 - goto out; 3194 - } 3195 - } 3196 - 3197 - stop_loop = 0; 3198 - while (1) { 3199 - u64 bytes; 3200 - 3201 - l = path->nodes[0]; 3202 - slot = path->slots[0]; 3203 - if (slot >= btrfs_header_nritems(l)) { 3204 - ret = btrfs_next_leaf(root, path); 3205 - if (ret == 0) 3206 - continue; 3207 - if (ret < 0) 3208 - goto out; 3209 - 3210 - stop_loop = 1; 3211 - break; 3212 - } 3213 - btrfs_item_key_to_cpu(l, &key, slot); 3214 - 3215 - if (key.type != BTRFS_EXTENT_ITEM_KEY && 3216 - key.type != BTRFS_METADATA_ITEM_KEY) 3217 - goto next; 3218 - 3219 - if (key.type == BTRFS_METADATA_ITEM_KEY) 3220 - bytes = fs_info->nodesize; 3221 - else 3222 - bytes = key.offset; 3223 - 3224 - if (key.objectid + bytes <= logic_start) 3225 - goto next; 3226 - 3227 - if (key.objectid >= logic_end) { 3228 - stop_loop = 1; 3229 - break; 3230 - } 3231 - 3232 - while (key.objectid >= logic_start + map->stripe_len) 3233 - logic_start += map->stripe_len; 3234 - 3235 - extent = btrfs_item_ptr(l, slot, 3236 - struct btrfs_extent_item); 3237 - flags = btrfs_extent_flags(l, extent); 3238 - generation = btrfs_extent_generation(l, extent); 3239 - 3240 - if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && 3241 - (key.objectid < logic_start || 3242 - key.objectid + bytes > 3243 - logic_start + map->stripe_len)) { 3244 - btrfs_err(fs_info, 3245 - "scrub: tree block %llu spanning stripes, ignored. 
logical=%llu", 3246 - key.objectid, logic_start); 3247 - spin_lock(&sctx->stat_lock); 3248 - sctx->stat.uncorrectable_errors++; 3249 - spin_unlock(&sctx->stat_lock); 3250 - goto next; 3251 - } 3252 - again: 3253 - extent_logical = key.objectid; 3254 - ASSERT(bytes <= U32_MAX); 3255 - extent_len = bytes; 3256 - 3257 - if (extent_logical < logic_start) { 3258 - extent_len -= logic_start - extent_logical; 3259 - extent_logical = logic_start; 3260 - } 3261 - 3262 - if (extent_logical + extent_len > 3263 - logic_start + map->stripe_len) 3264 - extent_len = logic_start + map->stripe_len - 3265 - extent_logical; 3266 - 3267 - scrub_parity_mark_sectors_data(sparity, extent_logical, 3268 - extent_len); 3269 - 3270 - mapped_length = extent_len; 3271 - bioc = NULL; 3272 - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, 3273 - extent_logical, &mapped_length, &bioc, 3274 - 0); 3275 - if (!ret) { 3276 - if (!bioc || mapped_length < extent_len) 3277 - ret = -EIO; 3278 - } 3279 - if (ret) { 3280 - btrfs_put_bioc(bioc); 3281 - goto out; 3282 - } 3283 - extent_physical = bioc->stripes[0].physical; 3284 - extent_mirror_num = bioc->mirror_num; 3285 - extent_dev = bioc->stripes[0].dev; 3286 - btrfs_put_bioc(bioc); 3287 - 3288 - csum_root = btrfs_csum_root(fs_info, extent_logical); 3289 - ret = btrfs_lookup_csums_range(csum_root, 3290 - extent_logical, 3291 - extent_logical + extent_len - 1, 3292 - &sctx->csum_list, 1); 3293 - if (ret) 3294 - goto out; 3295 - 3296 - ret = scrub_extent_for_parity(sparity, extent_logical, 3297 - extent_len, 3298 - extent_physical, 3299 - extent_dev, flags, 3300 - generation, 3301 - extent_mirror_num); 3302 - 3303 - scrub_free_csums(sctx); 3304 - 3305 - if (ret) 3306 - goto out; 3307 - 3308 - if (extent_logical + extent_len < 3309 - key.objectid + bytes) { 3310 - logic_start += map->stripe_len; 3311 - 3312 - if (logic_start >= logic_end) { 3313 - stop_loop = 1; 3314 - break; 3315 - } 3316 - 3317 - if (logic_start < key.objectid + bytes) { 3318 - 
cond_resched(); 3319 - goto again; 3320 - } 3321 - } 3322 - next: 3323 - path->slots[0]++; 3324 - } 3325 - 3326 - btrfs_release_path(path); 3327 - 3328 - if (stop_loop) 3329 2956 break; 2957 + } 3330 2958 3331 - logic_start += map->stripe_len; 3332 - } 3333 - out: 3334 - if (ret < 0) { 3335 - ASSERT(logic_end - logic_start <= U32_MAX); 3336 - scrub_parity_mark_sectors_error(sparity, logic_start, 3337 - logic_end - logic_start); 3338 - } 3339 2959 scrub_parity_put(sparity); 3340 2960 scrub_submit(sctx); 3341 2961 mutex_lock(&sctx->wr_lock); ··· 3227 3165 return ret; 3228 3166 } 3229 3167 3168 + /* 3169 + * Scrub one range which can only has simple mirror based profile. 3170 + * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in 3171 + * RAID0/RAID10). 3172 + * 3173 + * Since we may need to handle a subset of block group, we need @logical_start 3174 + * and @logical_length parameter. 3175 + */ 3176 + static int scrub_simple_mirror(struct scrub_ctx *sctx, 3177 + struct btrfs_root *extent_root, 3178 + struct btrfs_root *csum_root, 3179 + struct btrfs_block_group *bg, 3180 + struct map_lookup *map, 3181 + u64 logical_start, u64 logical_length, 3182 + struct btrfs_device *device, 3183 + u64 physical, int mirror_num) 3184 + { 3185 + struct btrfs_fs_info *fs_info = sctx->fs_info; 3186 + const u64 logical_end = logical_start + logical_length; 3187 + /* An artificial limit, inherit from old scrub behavior */ 3188 + const u32 max_length = SZ_64K; 3189 + struct btrfs_path path = { 0 }; 3190 + u64 cur_logical = logical_start; 3191 + int ret; 3192 + 3193 + /* The range must be inside the bg */ 3194 + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); 3195 + 3196 + path.search_commit_root = 1; 3197 + path.skip_locking = 1; 3198 + /* Go through each extent items inside the logical range */ 3199 + while (cur_logical < logical_end) { 3200 + u64 extent_start; 3201 + u64 extent_len; 3202 + u64 extent_flags; 3203 + u64 extent_gen; 3204 + u64 
scrub_len; 3205 + 3206 + /* Canceled? */ 3207 + if (atomic_read(&fs_info->scrub_cancel_req) || 3208 + atomic_read(&sctx->cancel_req)) { 3209 + ret = -ECANCELED; 3210 + break; 3211 + } 3212 + /* Paused? */ 3213 + if (atomic_read(&fs_info->scrub_pause_req)) { 3214 + /* Push queued extents */ 3215 + sctx->flush_all_writes = true; 3216 + scrub_submit(sctx); 3217 + mutex_lock(&sctx->wr_lock); 3218 + scrub_wr_submit(sctx); 3219 + mutex_unlock(&sctx->wr_lock); 3220 + wait_event(sctx->list_wait, 3221 + atomic_read(&sctx->bios_in_flight) == 0); 3222 + sctx->flush_all_writes = false; 3223 + scrub_blocked_if_needed(fs_info); 3224 + } 3225 + /* Block group removed? */ 3226 + spin_lock(&bg->lock); 3227 + if (bg->removed) { 3228 + spin_unlock(&bg->lock); 3229 + ret = 0; 3230 + break; 3231 + } 3232 + spin_unlock(&bg->lock); 3233 + 3234 + ret = find_first_extent_item(extent_root, &path, cur_logical, 3235 + logical_end - cur_logical); 3236 + if (ret > 0) { 3237 + /* No more extent, just update the accounting */ 3238 + sctx->stat.last_physical = physical + logical_length; 3239 + ret = 0; 3240 + break; 3241 + } 3242 + if (ret < 0) 3243 + break; 3244 + get_extent_info(&path, &extent_start, &extent_len, 3245 + &extent_flags, &extent_gen); 3246 + /* Skip hole range which doesn't have any extent */ 3247 + cur_logical = max(extent_start, cur_logical); 3248 + 3249 + /* 3250 + * Scrub len has three limits: 3251 + * - Extent size limit 3252 + * - Scrub range limit 3253 + * This is especially imporatant for RAID0/RAID10 to reuse 3254 + * this function 3255 + * - Max scrub size limit 3256 + */ 3257 + scrub_len = min(min(extent_start + extent_len, 3258 + logical_end), cur_logical + max_length) - 3259 + cur_logical; 3260 + 3261 + if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { 3262 + ret = btrfs_lookup_csums_range(csum_root, cur_logical, 3263 + cur_logical + scrub_len - 1, 3264 + &sctx->csum_list, 1); 3265 + if (ret) 3266 + break; 3267 + } 3268 + if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && 
3269 + does_range_cross_boundary(extent_start, extent_len, 3270 + logical_start, logical_length)) { 3271 + btrfs_err(fs_info, 3272 + "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)", 3273 + extent_start, logical_start, logical_end); 3274 + spin_lock(&sctx->stat_lock); 3275 + sctx->stat.uncorrectable_errors++; 3276 + spin_unlock(&sctx->stat_lock); 3277 + cur_logical += scrub_len; 3278 + continue; 3279 + } 3280 + ret = scrub_extent(sctx, map, cur_logical, scrub_len, 3281 + cur_logical - logical_start + physical, 3282 + device, extent_flags, extent_gen, 3283 + mirror_num); 3284 + scrub_free_csums(sctx); 3285 + if (ret) 3286 + break; 3287 + if (sctx->is_dev_replace) 3288 + sync_replace_for_zoned(sctx); 3289 + cur_logical += scrub_len; 3290 + /* Don't hold CPU for too long time */ 3291 + cond_resched(); 3292 + } 3293 + btrfs_release_path(&path); 3294 + return ret; 3295 + } 3296 + 3297 + /* Calculate the full stripe length for simple stripe based profiles */ 3298 + static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) 3299 + { 3300 + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3301 + BTRFS_BLOCK_GROUP_RAID10)); 3302 + 3303 + return map->num_stripes / map->sub_stripes * map->stripe_len; 3304 + } 3305 + 3306 + /* Get the logical bytenr for the stripe */ 3307 + static u64 simple_stripe_get_logical(struct map_lookup *map, 3308 + struct btrfs_block_group *bg, 3309 + int stripe_index) 3310 + { 3311 + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3312 + BTRFS_BLOCK_GROUP_RAID10)); 3313 + ASSERT(stripe_index < map->num_stripes); 3314 + 3315 + /* 3316 + * (stripe_index / sub_stripes) gives how many data stripes we need to 3317 + * skip. 
3318 + */ 3319 + return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start; 3320 + } 3321 + 3322 + /* Get the mirror number for the stripe */ 3323 + static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) 3324 + { 3325 + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3326 + BTRFS_BLOCK_GROUP_RAID10)); 3327 + ASSERT(stripe_index < map->num_stripes); 3328 + 3329 + /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */ 3330 + return stripe_index % map->sub_stripes + 1; 3331 + } 3332 + 3333 + static int scrub_simple_stripe(struct scrub_ctx *sctx, 3334 + struct btrfs_root *extent_root, 3335 + struct btrfs_root *csum_root, 3336 + struct btrfs_block_group *bg, 3337 + struct map_lookup *map, 3338 + struct btrfs_device *device, 3339 + int stripe_index) 3340 + { 3341 + const u64 logical_increment = simple_stripe_full_stripe_len(map); 3342 + const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); 3343 + const u64 orig_physical = map->stripes[stripe_index].physical; 3344 + const int mirror_num = simple_stripe_mirror_num(map, stripe_index); 3345 + u64 cur_logical = orig_logical; 3346 + u64 cur_physical = orig_physical; 3347 + int ret = 0; 3348 + 3349 + while (cur_logical < bg->start + bg->length) { 3350 + /* 3351 + * Inside each stripe, RAID0 is just SINGLE, and RAID10 is 3352 + * just RAID1, so we can reuse scrub_simple_mirror() to scrub 3353 + * this stripe. 
3354 + */ 3355 + ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map, 3356 + cur_logical, map->stripe_len, device, 3357 + cur_physical, mirror_num); 3358 + if (ret) 3359 + return ret; 3360 + /* Skip to next stripe which belongs to the target device */ 3361 + cur_logical += logical_increment; 3362 + /* For physical offset, we just go to next stripe */ 3363 + cur_physical += map->stripe_len; 3364 + } 3365 + return ret; 3366 + } 3367 + 3230 3368 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 3231 3369 struct btrfs_block_group *bg, 3232 3370 struct map_lookup *map, ··· 3437 3175 struct btrfs_fs_info *fs_info = sctx->fs_info; 3438 3176 struct btrfs_root *root; 3439 3177 struct btrfs_root *csum_root; 3440 - struct btrfs_extent_item *extent; 3441 3178 struct blk_plug plug; 3179 + const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; 3442 3180 const u64 chunk_logical = bg->start; 3443 - u64 flags; 3444 3181 int ret; 3445 - int slot; 3446 - u64 nstripes; 3447 - struct extent_buffer *l; 3448 - u64 physical; 3182 + u64 physical = map->stripes[stripe_index].physical; 3183 + const u64 physical_end = physical + dev_extent_len; 3449 3184 u64 logical; 3450 3185 u64 logic_end; 3451 - u64 physical_end; 3452 - u64 generation; 3453 - int mirror_num; 3454 - struct btrfs_key key; 3186 + /* The logical increment after finishing one stripe */ 3455 3187 u64 increment; 3188 + /* Offset inside the chunk */ 3456 3189 u64 offset; 3457 - u64 extent_logical; 3458 - u64 extent_physical; 3459 - /* 3460 - * Unlike chunk length, extent length should never go beyond 3461 - * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here. 
3462 - */ 3463 - u32 extent_len; 3464 3190 u64 stripe_logical; 3465 3191 u64 stripe_end; 3466 - struct btrfs_device *extent_dev; 3467 - int extent_mirror_num; 3468 3192 int stop_loop = 0; 3469 - 3470 - physical = map->stripes[stripe_index].physical; 3471 - offset = 0; 3472 - nstripes = div64_u64(dev_extent_len, map->stripe_len); 3473 - mirror_num = 1; 3474 - increment = map->stripe_len; 3475 - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3476 - offset = map->stripe_len * stripe_index; 3477 - increment = map->stripe_len * map->num_stripes; 3478 - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3479 - int factor = map->num_stripes / map->sub_stripes; 3480 - offset = map->stripe_len * (stripe_index / map->sub_stripes); 3481 - increment = map->stripe_len * factor; 3482 - mirror_num = stripe_index % map->sub_stripes + 1; 3483 - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 3484 - mirror_num = stripe_index % map->num_stripes + 1; 3485 - } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3486 - mirror_num = stripe_index % map->num_stripes + 1; 3487 - } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3488 - get_raid56_logic_offset(physical, stripe_index, map, &offset, 3489 - NULL); 3490 - increment = map->stripe_len * nr_data_stripes(map); 3491 - } 3492 3193 3493 3194 path = btrfs_alloc_path(); 3494 3195 if (!path) ··· 3466 3241 path->skip_locking = 1; 3467 3242 path->reada = READA_FORWARD; 3468 3243 3469 - logical = chunk_logical + offset; 3470 - physical_end = physical + nstripes * map->stripe_len; 3471 - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3472 - get_raid56_logic_offset(physical_end, stripe_index, 3473 - map, &logic_end, NULL); 3474 - logic_end += chunk_logical; 3475 - } else { 3476 - logic_end = logical + increment * nstripes; 3477 - } 3478 3244 wait_event(sctx->list_wait, 3479 3245 atomic_read(&sctx->bios_in_flight) == 0); 3480 3246 scrub_blocked_if_needed(fs_info); 3481 3247 3482 - root = btrfs_extent_root(fs_info, logical); 3483 - csum_root 
= btrfs_csum_root(fs_info, logical); 3248 + root = btrfs_extent_root(fs_info, bg->start); 3249 + csum_root = btrfs_csum_root(fs_info, bg->start); 3484 3250 3485 3251 /* 3486 3252 * collect all data csums for the stripe to avoid seeking during ··· 3488 3272 } 3489 3273 3490 3274 /* 3491 - * now find all extents for each stripe and scrub them 3275 + * There used to be a big double loop to handle all profiles using the 3276 + * same routine, which grows larger and more gross over time. 3277 + * 3278 + * So here we handle each profile differently, so simpler profiles 3279 + * have simpler scrubbing function. 3492 3280 */ 3281 + if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 | 3282 + BTRFS_BLOCK_GROUP_RAID56_MASK))) { 3283 + /* 3284 + * Above check rules out all complex profile, the remaining 3285 + * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple 3286 + * mirrored duplication without stripe. 3287 + * 3288 + * Only @physical and @mirror_num needs to calculated using 3289 + * @stripe_index. 
3290 + */ 3291 + ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, 3292 + bg->start, bg->length, scrub_dev, 3293 + map->stripes[stripe_index].physical, 3294 + stripe_index + 1); 3295 + offset = 0; 3296 + goto out; 3297 + } 3298 + if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 3299 + ret = scrub_simple_stripe(sctx, root, csum_root, bg, map, 3300 + scrub_dev, stripe_index); 3301 + offset = map->stripe_len * (stripe_index / map->sub_stripes); 3302 + goto out; 3303 + } 3304 + 3305 + /* Only RAID56 goes through the old code */ 3306 + ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); 3493 3307 ret = 0; 3308 + 3309 + /* Calculate the logical end of the stripe */ 3310 + get_raid56_logic_offset(physical_end, stripe_index, 3311 + map, &logic_end, NULL); 3312 + logic_end += chunk_logical; 3313 + 3314 + /* Initialize @offset in case we need to go to out: label */ 3315 + get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); 3316 + increment = map->stripe_len * nr_data_stripes(map); 3317 + 3318 + /* 3319 + * Due to the rotation, for RAID56 it's better to iterate each stripe 3320 + * using their physical offset. 3321 + */ 3494 3322 while (physical < physical_end) { 3495 - /* 3496 - * canceled? 
3497 - */ 3498 - if (atomic_read(&fs_info->scrub_cancel_req) || 3499 - atomic_read(&sctx->cancel_req)) { 3500 - ret = -ECANCELED; 3501 - goto out; 3502 - } 3503 - /* 3504 - * check to see if we have to pause 3505 - */ 3506 - if (atomic_read(&fs_info->scrub_pause_req)) { 3507 - /* push queued extents */ 3508 - sctx->flush_all_writes = true; 3509 - scrub_submit(sctx); 3510 - mutex_lock(&sctx->wr_lock); 3511 - scrub_wr_submit(sctx); 3512 - mutex_unlock(&sctx->wr_lock); 3513 - wait_event(sctx->list_wait, 3514 - atomic_read(&sctx->bios_in_flight) == 0); 3515 - sctx->flush_all_writes = false; 3516 - scrub_blocked_if_needed(fs_info); 3517 - } 3518 - 3519 - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3520 - ret = get_raid56_logic_offset(physical, stripe_index, 3521 - map, &logical, 3522 - &stripe_logical); 3523 - logical += chunk_logical; 3524 - if (ret) { 3525 - /* it is parity strip */ 3526 - stripe_logical += chunk_logical; 3527 - stripe_end = stripe_logical + increment; 3528 - ret = scrub_raid56_parity(sctx, map, scrub_dev, 3529 - stripe_logical, 3530 - stripe_end); 3531 - if (ret) 3532 - goto out; 3533 - goto skip; 3534 - } 3535 - } 3536 - 3537 - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 3538 - key.type = BTRFS_METADATA_ITEM_KEY; 3539 - else 3540 - key.type = BTRFS_EXTENT_ITEM_KEY; 3541 - key.objectid = logical; 3542 - key.offset = (u64)-1; 3543 - 3544 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3545 - if (ret < 0) 3546 - goto out; 3547 - 3548 - if (ret > 0) { 3549 - ret = btrfs_previous_extent_item(root, path, 0); 3550 - if (ret < 0) 3551 - goto out; 3552 - if (ret > 0) { 3553 - /* there's no smaller item, so stick with the 3554 - * larger one */ 3555 - btrfs_release_path(path); 3556 - ret = btrfs_search_slot(NULL, root, &key, 3557 - path, 0, 0); 3558 - if (ret < 0) 3559 - goto out; 3560 - } 3561 - } 3562 - 3563 - stop_loop = 0; 3564 - while (1) { 3565 - u64 bytes; 3566 - 3567 - l = path->nodes[0]; 3568 - slot = path->slots[0]; 3569 - if 
(slot >= btrfs_header_nritems(l)) { 3570 - ret = btrfs_next_leaf(root, path); 3571 - if (ret == 0) 3572 - continue; 3573 - if (ret < 0) 3574 - goto out; 3575 - 3576 - stop_loop = 1; 3577 - break; 3578 - } 3579 - btrfs_item_key_to_cpu(l, &key, slot); 3580 - 3581 - if (key.type != BTRFS_EXTENT_ITEM_KEY && 3582 - key.type != BTRFS_METADATA_ITEM_KEY) 3583 - goto next; 3584 - 3585 - if (key.type == BTRFS_METADATA_ITEM_KEY) 3586 - bytes = fs_info->nodesize; 3587 - else 3588 - bytes = key.offset; 3589 - 3590 - if (key.objectid + bytes <= logical) 3591 - goto next; 3592 - 3593 - if (key.objectid >= logical + map->stripe_len) { 3594 - /* out of this device extent */ 3595 - if (key.objectid >= logic_end) 3596 - stop_loop = 1; 3597 - break; 3598 - } 3599 - 3600 - /* 3601 - * If our block group was removed in the meanwhile, just 3602 - * stop scrubbing since there is no point in continuing. 3603 - * Continuing would prevent reusing its device extents 3604 - * for new block groups for a long time. 3605 - */ 3606 - spin_lock(&bg->lock); 3607 - if (bg->removed) { 3608 - spin_unlock(&bg->lock); 3609 - ret = 0; 3610 - goto out; 3611 - } 3612 - spin_unlock(&bg->lock); 3613 - 3614 - extent = btrfs_item_ptr(l, slot, 3615 - struct btrfs_extent_item); 3616 - flags = btrfs_extent_flags(l, extent); 3617 - generation = btrfs_extent_generation(l, extent); 3618 - 3619 - if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && 3620 - (key.objectid < logical || 3621 - key.objectid + bytes > 3622 - logical + map->stripe_len)) { 3623 - btrfs_err(fs_info, 3624 - "scrub: tree block %llu spanning stripes, ignored. 
logical=%llu", 3625 - key.objectid, logical); 3626 - spin_lock(&sctx->stat_lock); 3627 - sctx->stat.uncorrectable_errors++; 3628 - spin_unlock(&sctx->stat_lock); 3629 - goto next; 3630 - } 3631 - 3632 - again: 3633 - extent_logical = key.objectid; 3634 - ASSERT(bytes <= U32_MAX); 3635 - extent_len = bytes; 3636 - 3637 - /* 3638 - * trim extent to this stripe 3639 - */ 3640 - if (extent_logical < logical) { 3641 - extent_len -= logical - extent_logical; 3642 - extent_logical = logical; 3643 - } 3644 - if (extent_logical + extent_len > 3645 - logical + map->stripe_len) { 3646 - extent_len = logical + map->stripe_len - 3647 - extent_logical; 3648 - } 3649 - 3650 - extent_physical = extent_logical - logical + physical; 3651 - extent_dev = scrub_dev; 3652 - extent_mirror_num = mirror_num; 3653 - if (sctx->is_dev_replace) 3654 - scrub_remap_extent(fs_info, extent_logical, 3655 - extent_len, &extent_physical, 3656 - &extent_dev, 3657 - &extent_mirror_num); 3658 - 3659 - if (flags & BTRFS_EXTENT_FLAG_DATA) { 3660 - ret = btrfs_lookup_csums_range(csum_root, 3661 - extent_logical, 3662 - extent_logical + extent_len - 1, 3663 - &sctx->csum_list, 1); 3664 - if (ret) 3665 - goto out; 3666 - } 3667 - 3668 - ret = scrub_extent(sctx, map, extent_logical, extent_len, 3669 - extent_physical, extent_dev, flags, 3670 - generation, extent_mirror_num, 3671 - extent_logical - logical + physical); 3672 - 3673 - scrub_free_csums(sctx); 3674 - 3323 + ret = get_raid56_logic_offset(physical, stripe_index, map, 3324 + &logical, &stripe_logical); 3325 + logical += chunk_logical; 3326 + if (ret) { 3327 + /* it is parity strip */ 3328 + stripe_logical += chunk_logical; 3329 + stripe_end = stripe_logical + increment; 3330 + ret = scrub_raid56_parity(sctx, map, scrub_dev, 3331 + stripe_logical, 3332 + stripe_end); 3675 3333 if (ret) 3676 3334 goto out; 3677 - 3678 - if (sctx->is_dev_replace) 3679 - sync_replace_for_zoned(sctx); 3680 - 3681 - if (extent_logical + extent_len < 3682 - key.objectid + 
bytes) { 3683 - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3684 - /* 3685 - * loop until we find next data stripe 3686 - * or we have finished all stripes. 3687 - */ 3688 - loop: 3689 - physical += map->stripe_len; 3690 - ret = get_raid56_logic_offset(physical, 3691 - stripe_index, map, 3692 - &logical, &stripe_logical); 3693 - logical += chunk_logical; 3694 - 3695 - if (ret && physical < physical_end) { 3696 - stripe_logical += chunk_logical; 3697 - stripe_end = stripe_logical + 3698 - increment; 3699 - ret = scrub_raid56_parity(sctx, 3700 - map, scrub_dev, 3701 - stripe_logical, 3702 - stripe_end); 3703 - if (ret) 3704 - goto out; 3705 - goto loop; 3706 - } 3707 - } else { 3708 - physical += map->stripe_len; 3709 - logical += increment; 3710 - } 3711 - if (logical < key.objectid + bytes) { 3712 - cond_resched(); 3713 - goto again; 3714 - } 3715 - 3716 - if (physical >= physical_end) { 3717 - stop_loop = 1; 3718 - break; 3719 - } 3720 - } 3721 - next: 3722 - path->slots[0]++; 3335 + goto next; 3723 3336 } 3724 - btrfs_release_path(path); 3725 - skip: 3337 + 3338 + /* 3339 + * Now we're at a data stripe, scrub each extents in the range. 3340 + * 3341 + * At this stage, if we ignore the repair part, inside each data 3342 + * stripe it is no different than SINGLE profile. 3343 + * We can reuse scrub_simple_mirror() here, as the repair part 3344 + * is still based on @mirror_num. 
3345 + */ 3346 + ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, 3347 + logical, map->stripe_len, 3348 + scrub_dev, physical, 1); 3349 + if (ret < 0) 3350 + goto out; 3351 + next: 3726 3352 logical += increment; 3727 3353 physical += map->stripe_len; 3728 3354 spin_lock(&sctx->stat_lock); ··· 4022 3964 if (!btrfs_check_super_location(scrub_dev, bytenr)) 4023 3965 continue; 4024 3966 4025 - ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 4026 - scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, 4027 - NULL, bytenr); 3967 + ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 3968 + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, 3969 + NULL, bytenr); 4028 3970 if (ret) 4029 3971 return ret; 4030 3972 } ··· 4037 3979 { 4038 3980 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, 4039 3981 &fs_info->scrub_lock)) { 4040 - struct btrfs_workqueue *scrub_workers = NULL; 4041 - struct btrfs_workqueue *scrub_wr_comp = NULL; 4042 - struct btrfs_workqueue *scrub_parity = NULL; 4043 - 4044 - scrub_workers = fs_info->scrub_workers; 4045 - scrub_wr_comp = fs_info->scrub_wr_completion_workers; 4046 - scrub_parity = fs_info->scrub_parity_workers; 3982 + struct workqueue_struct *scrub_workers = fs_info->scrub_workers; 3983 + struct workqueue_struct *scrub_wr_comp = 3984 + fs_info->scrub_wr_completion_workers; 3985 + struct workqueue_struct *scrub_parity = 3986 + fs_info->scrub_parity_workers; 4047 3987 4048 3988 fs_info->scrub_workers = NULL; 4049 3989 fs_info->scrub_wr_completion_workers = NULL; 4050 3990 fs_info->scrub_parity_workers = NULL; 4051 3991 mutex_unlock(&fs_info->scrub_lock); 4052 3992 4053 - btrfs_destroy_workqueue(scrub_workers); 4054 - btrfs_destroy_workqueue(scrub_wr_comp); 4055 - btrfs_destroy_workqueue(scrub_parity); 3993 + if (scrub_workers) 3994 + destroy_workqueue(scrub_workers); 3995 + if (scrub_wr_comp) 3996 + destroy_workqueue(scrub_wr_comp); 3997 + if (scrub_parity) 3998 + destroy_workqueue(scrub_parity); 4056 3999 } 
4057 4000 } 4058 4001 ··· 4063 4004 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, 4064 4005 int is_dev_replace) 4065 4006 { 4066 - struct btrfs_workqueue *scrub_workers = NULL; 4067 - struct btrfs_workqueue *scrub_wr_comp = NULL; 4068 - struct btrfs_workqueue *scrub_parity = NULL; 4007 + struct workqueue_struct *scrub_workers = NULL; 4008 + struct workqueue_struct *scrub_wr_comp = NULL; 4009 + struct workqueue_struct *scrub_parity = NULL; 4069 4010 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; 4070 4011 int max_active = fs_info->thread_pool_size; 4071 4012 int ret = -ENOMEM; ··· 4073 4014 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) 4074 4015 return 0; 4075 4016 4076 - scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, 4077 - is_dev_replace ? 1 : max_active, 4); 4017 + scrub_workers = alloc_workqueue("btrfs-scrub", flags, 4018 + is_dev_replace ? 1 : max_active); 4078 4019 if (!scrub_workers) 4079 4020 goto fail_scrub_workers; 4080 4021 4081 - scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, 4082 - max_active, 2); 4022 + scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active); 4083 4023 if (!scrub_wr_comp) 4084 4024 goto fail_scrub_wr_completion_workers; 4085 4025 4086 - scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, 4087 - max_active, 2); 4026 + scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active); 4088 4027 if (!scrub_parity) 4089 4028 goto fail_scrub_parity_workers; 4090 4029 ··· 4103 4046 mutex_unlock(&fs_info->scrub_lock); 4104 4047 4105 4048 ret = 0; 4106 - btrfs_destroy_workqueue(scrub_parity); 4049 + destroy_workqueue(scrub_parity); 4107 4050 fail_scrub_parity_workers: 4108 - btrfs_destroy_workqueue(scrub_wr_comp); 4051 + destroy_workqueue(scrub_wr_comp); 4109 4052 fail_scrub_wr_completion_workers: 4110 - btrfs_destroy_workqueue(scrub_workers); 4053 + destroy_workqueue(scrub_workers); 4111 4054 fail_scrub_workers: 4112 4055 return 
ret; 4113 4056 } ··· 4139 4082 } 4140 4083 4141 4084 if (fs_info->nodesize > 4142 - PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || 4143 - fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { 4085 + SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits || 4086 + fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) { 4144 4087 /* 4145 - * would exhaust the array bounds of pagev member in 4088 + * Would exhaust the array bounds of sectorv member in 4146 4089 * struct scrub_block 4147 4090 */ 4148 4091 btrfs_err(fs_info, 4149 - "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", 4150 - fs_info->nodesize, 4151 - SCRUB_MAX_PAGES_PER_BLOCK, 4152 - fs_info->sectorsize, 4153 - SCRUB_MAX_PAGES_PER_BLOCK); 4092 + "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails", 4093 + fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK, 4094 + fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK); 4154 4095 return -EINVAL; 4155 4096 } 4156 4097 ··· 4216 4161 /* 4217 4162 * In order to avoid deadlock with reclaim when there is a transaction 4218 4163 * trying to pause scrub, make sure we use GFP_NOFS for all the 4219 - * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity() 4164 + * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity() 4220 4165 * invoked by our callees. The pausing request is done when the 4221 4166 * transaction commit starts, and it blocks the transaction until scrub 4222 4167 * is paused (done at specific points at scrub_stripe() or right above ··· 4350 4295 return dev ? (sctx ? 
0 : -ENOTCONN) : -ENODEV; 4351 4296 } 4352 4297 4353 - static void scrub_remap_extent(struct btrfs_fs_info *fs_info, 4354 - u64 extent_logical, u32 extent_len, 4355 - u64 *extent_physical, 4356 - struct btrfs_device **extent_dev, 4357 - int *extent_mirror_num) 4298 + static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, 4299 + u64 extent_logical, u32 extent_len, 4300 + u64 *extent_physical, 4301 + struct btrfs_device **extent_dev, 4302 + int *extent_mirror_num) 4358 4303 { 4359 4304 u64 mapped_length; 4360 4305 struct btrfs_io_context *bioc = NULL;

+193 -207

fs/btrfs/send.c

··· 10 10 #include <linux/mount.h> 11 11 #include <linux/xattr.h> 12 12 #include <linux/posix_acl_xattr.h> 13 - #include <linux/radix-tree.h> 14 13 #include <linux/vmalloc.h> 15 14 #include <linux/string.h> 16 15 #include <linux/compat.h> ··· 127 128 struct list_head new_refs; 128 129 struct list_head deleted_refs; 129 130 130 - struct radix_tree_root name_cache; 131 + struct xarray name_cache; 131 132 struct list_head name_cache_list; 132 133 int name_cache_size; 133 134 135 + /* 136 + * The inode we are currently processing. It's not NULL only when we 137 + * need to issue write commands for data extents from this inode. 138 + */ 139 + struct inode *cur_inode; 134 140 struct file_ra_state ra; 141 + u64 page_cache_clear_start; 142 + bool clean_page_cache; 135 143 136 144 /* 137 145 * We process inodes by their increasing order, so if before an ··· 268 262 struct name_cache_entry { 269 263 struct list_head list; 270 264 /* 271 - * radix_tree has only 32bit entries but we need to handle 64bit inums. 272 - * We use the lower 32bit of the 64bit inum to store it in the tree. If 273 - * more then one inum would fall into the same entry, we use radix_list 274 - * to store the additional entries. radix_list is also used to store 275 - * entries where two entries have the same inum but different 276 - * generations. 265 + * On 32bit kernels, xarray has only 32bit indices, but we need to 266 + * handle 64bit inums. We use the lower 32bit of the 64bit inum to store 267 + * it in the tree. If more than one inum would fall into the same entry, 268 + * we use inum_aliases to store the additional entries. inum_aliases is 269 + * also used to store entries with the same inum but different generations. 277 270 */ 278 - struct list_head radix_list; 271 + struct list_head inum_aliases; 279 272 u64 ino; 280 273 u64 gen; 281 274 u64 parent_ino; ··· 2024 2019 } 2025 2020 2026 2021 /* 2027 - * Insert a name cache entry. 
On 32bit kernels the radix tree index is 32bit, 2022 + * Insert a name cache entry. On 32bit kernels the xarray index is 32bit, 2028 2023 * so we need to do some special handling in case we have clashes. This function 2029 - * takes care of this with the help of name_cache_entry::radix_list. 2024 + * takes care of this with the help of name_cache_entry::inum_aliases. 2030 2025 * In case of error, nce is kfreed. 2031 2026 */ 2032 2027 static int name_cache_insert(struct send_ctx *sctx, ··· 2035 2030 int ret = 0; 2036 2031 struct list_head *nce_head; 2037 2032 2038 - nce_head = radix_tree_lookup(&sctx->name_cache, 2039 - (unsigned long)nce->ino); 2033 + nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); 2040 2034 if (!nce_head) { 2041 2035 nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL); 2042 2036 if (!nce_head) { ··· 2044 2040 } 2045 2041 INIT_LIST_HEAD(nce_head); 2046 2042 2047 - ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); 2043 + ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL); 2048 2044 if (ret < 0) { 2049 2045 kfree(nce_head); 2050 2046 kfree(nce); 2051 2047 return ret; 2052 2048 } 2053 2049 } 2054 - list_add_tail(&nce->radix_list, nce_head); 2050 + list_add_tail(&nce->inum_aliases, nce_head); 2055 2051 list_add_tail(&nce->list, &sctx->name_cache_list); 2056 2052 sctx->name_cache_size++; 2057 2053 ··· 2063 2059 { 2064 2060 struct list_head *nce_head; 2065 2061 2066 - nce_head = radix_tree_lookup(&sctx->name_cache, 2067 - (unsigned long)nce->ino); 2062 + nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino); 2068 2063 if (!nce_head) { 2069 2064 btrfs_err(sctx->send_root->fs_info, 2070 2065 "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", 2071 2066 nce->ino, sctx->name_cache_size); 2072 2067 } 2073 2068 2074 - list_del(&nce->radix_list); 2069 + list_del(&nce->inum_aliases); 2075 2070 list_del(&nce->list); 2076 2071 sctx->name_cache_size--; 2077 2072 ··· 2078 2075 * We may not 
get to the final release of nce_head if the lookup fails 2079 2076 */ 2080 2077 if (nce_head && list_empty(nce_head)) { 2081 - radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); 2078 + xa_erase(&sctx->name_cache, (unsigned long)nce->ino); 2082 2079 kfree(nce_head); 2083 2080 } 2084 2081 } ··· 2089 2086 struct list_head *nce_head; 2090 2087 struct name_cache_entry *cur; 2091 2088 2092 - nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino); 2089 + nce_head = xa_load(&sctx->name_cache, (unsigned long)ino); 2093 2090 if (!nce_head) 2094 2091 return NULL; 2095 2092 2096 - list_for_each_entry(cur, nce_head, radix_list) { 2093 + list_for_each_entry(cur, nce_head, inum_aliases) { 2097 2094 if (cur->ino == ino && cur->gen == gen) 2098 2095 return cur; 2099 2096 } ··· 2678 2675 static int did_create_dir(struct send_ctx *sctx, u64 dir) 2679 2676 { 2680 2677 int ret = 0; 2678 + int iter_ret = 0; 2681 2679 struct btrfs_path *path = NULL; 2682 2680 struct btrfs_key key; 2683 2681 struct btrfs_key found_key; 2684 2682 struct btrfs_key di_key; 2685 - struct extent_buffer *eb; 2686 2683 struct btrfs_dir_item *di; 2687 - int slot; 2688 2684 2689 2685 path = alloc_path_for_send(); 2690 - if (!path) { 2691 - ret = -ENOMEM; 2692 - goto out; 2693 - } 2686 + if (!path) 2687 + return -ENOMEM; 2694 2688 2695 2689 key.objectid = dir; 2696 2690 key.type = BTRFS_DIR_INDEX_KEY; 2697 2691 key.offset = 0; 2698 - ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); 2699 - if (ret < 0) 2700 - goto out; 2701 2692 2702 - while (1) { 2703 - eb = path->nodes[0]; 2704 - slot = path->slots[0]; 2705 - if (slot >= btrfs_header_nritems(eb)) { 2706 - ret = btrfs_next_leaf(sctx->send_root, path); 2707 - if (ret < 0) { 2708 - goto out; 2709 - } else if (ret > 0) { 2710 - ret = 0; 2711 - break; 2712 - } 2713 - continue; 2714 - } 2693 + btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) { 2694 + struct extent_buffer *eb = path->nodes[0]; 2715 2695 2716 
- btrfs_item_key_to_cpu(eb, &found_key, slot); 2717 2696 if (found_key.objectid != key.objectid || 2718 2697 found_key.type != key.type) { 2719 2698 ret = 0; 2720 - goto out; 2699 + break; 2721 2700 } 2722 2701 2723 - di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2702 + di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item); 2724 2703 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2725 2704 2726 2705 if (di_key.type != BTRFS_ROOT_ITEM_KEY && 2727 2706 di_key.objectid < sctx->send_progress) { 2728 2707 ret = 1; 2729 - goto out; 2708 + break; 2730 2709 } 2731 - 2732 - path->slots[0]++; 2733 2710 } 2711 + /* Catch error found during iteration */ 2712 + if (iter_ret < 0) 2713 + ret = iter_ret; 2734 2714 2735 - out: 2736 2715 btrfs_free_path(path); 2737 2716 return ret; 2738 2717 } ··· 2918 2933 u64 send_progress) 2919 2934 { 2920 2935 int ret = 0; 2936 + int iter_ret = 0; 2921 2937 struct btrfs_root *root = sctx->parent_root; 2922 2938 struct btrfs_path *path; 2923 2939 struct btrfs_key key; ··· 2945 2959 if (odi) 2946 2960 key.offset = odi->last_dir_index_offset; 2947 2961 2948 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2949 - if (ret < 0) 2950 - goto out; 2951 - 2952 - while (1) { 2962 + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 2953 2963 struct waiting_dir_move *dm; 2954 2964 2955 - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2956 - ret = btrfs_next_leaf(root, path); 2957 - if (ret < 0) 2958 - goto out; 2959 - else if (ret > 0) 2960 - break; 2961 - continue; 2962 - } 2963 - btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2964 - path->slots[0]); 2965 2965 if (found_key.objectid != key.objectid || 2966 2966 found_key.type != key.type) 2967 2967 break; ··· 2982 3010 ret = 0; 2983 3011 goto out; 2984 3012 } 2985 - 2986 - path->slots[0]++; 3013 + } 3014 + if (iter_ret < 0) { 3015 + ret = iter_ret; 3016 + goto out; 2987 3017 } 2988 3018 free_orphan_dir_info(sctx, odi); 2989 3019 ··· 3553 3579 } 3554 3580 
3555 3581 /* 3556 - * Check if ino ino1 is an ancestor of inode ino2 in the given root for any 3582 + * Check if inode ino1 is an ancestor of inode ino2 in the given root for any 3557 3583 * possible path (in case ino2 is not a directory and has multiple hard links). 3558 3584 * Return 1 if true, 0 if false and < 0 on error. 3559 3585 */ ··· 3565 3591 { 3566 3592 bool free_fs_path = false; 3567 3593 int ret = 0; 3594 + int iter_ret = 0; 3568 3595 struct btrfs_path *path = NULL; 3569 3596 struct btrfs_key key; 3570 3597 ··· 3586 3611 key.type = BTRFS_INODE_REF_KEY; 3587 3612 key.offset = 0; 3588 3613 3589 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3590 - if (ret < 0) 3591 - goto out; 3592 - 3593 - while (true) { 3614 + btrfs_for_each_slot(root, &key, &key, path, iter_ret) { 3594 3615 struct extent_buffer *leaf = path->nodes[0]; 3595 3616 int slot = path->slots[0]; 3596 3617 u32 cur_offset = 0; 3597 3618 u32 item_size; 3598 3619 3599 - if (slot >= btrfs_header_nritems(leaf)) { 3600 - ret = btrfs_next_leaf(root, path); 3601 - if (ret < 0) 3602 - goto out; 3603 - if (ret > 0) 3604 - break; 3605 - continue; 3606 - } 3607 - 3608 - btrfs_item_key_to_cpu(leaf, &key, slot); 3609 3620 if (key.objectid != ino2) 3610 3621 break; 3611 3622 if (key.type != BTRFS_INODE_REF_KEY && ··· 3629 3668 if (ret) 3630 3669 goto out; 3631 3670 } 3632 - path->slots[0]++; 3633 3671 } 3634 3672 ret = 0; 3635 - out: 3673 + if (iter_ret < 0) 3674 + ret = iter_ret; 3675 + 3676 + out: 3636 3677 btrfs_free_path(path); 3637 3678 if (free_fs_path) 3638 3679 fs_path_free(fs_path); ··· 4514 4551 static int process_all_refs(struct send_ctx *sctx, 4515 4552 enum btrfs_compare_tree_result cmd) 4516 4553 { 4517 - int ret; 4554 + int ret = 0; 4555 + int iter_ret = 0; 4518 4556 struct btrfs_root *root; 4519 4557 struct btrfs_path *path; 4520 4558 struct btrfs_key key; 4521 4559 struct btrfs_key found_key; 4522 - struct extent_buffer *eb; 4523 - int slot; 4524 4560 iterate_inode_ref_t cb; 4525 
4561 int pending_move = 0; 4526 4562 ··· 4543 4581 key.objectid = sctx->cmp_key->objectid; 4544 4582 key.type = BTRFS_INODE_REF_KEY; 4545 4583 key.offset = 0; 4546 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4547 - if (ret < 0) 4548 - goto out; 4549 - 4550 - while (1) { 4551 - eb = path->nodes[0]; 4552 - slot = path->slots[0]; 4553 - if (slot >= btrfs_header_nritems(eb)) { 4554 - ret = btrfs_next_leaf(root, path); 4555 - if (ret < 0) 4556 - goto out; 4557 - else if (ret > 0) 4558 - break; 4559 - continue; 4560 - } 4561 - 4562 - btrfs_item_key_to_cpu(eb, &found_key, slot); 4563 - 4584 + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 4564 4585 if (found_key.objectid != key.objectid || 4565 4586 (found_key.type != BTRFS_INODE_REF_KEY && 4566 4587 found_key.type != BTRFS_INODE_EXTREF_KEY)) ··· 4552 4607 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); 4553 4608 if (ret < 0) 4554 4609 goto out; 4555 - 4556 - path->slots[0]++; 4610 + } 4611 + /* Catch error found during iteration */ 4612 + if (iter_ret < 0) { 4613 + ret = iter_ret; 4614 + goto out; 4557 4615 } 4558 4616 btrfs_release_path(path); 4559 4617 ··· 4818 4870 4819 4871 static int process_all_new_xattrs(struct send_ctx *sctx) 4820 4872 { 4821 - int ret; 4873 + int ret = 0; 4874 + int iter_ret = 0; 4822 4875 struct btrfs_root *root; 4823 4876 struct btrfs_path *path; 4824 4877 struct btrfs_key key; 4825 4878 struct btrfs_key found_key; 4826 - struct extent_buffer *eb; 4827 - int slot; 4828 4879 4829 4880 path = alloc_path_for_send(); 4830 4881 if (!path) ··· 4834 4887 key.objectid = sctx->cmp_key->objectid; 4835 4888 key.type = BTRFS_XATTR_ITEM_KEY; 4836 4889 key.offset = 0; 4837 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4838 - if (ret < 0) 4839 - goto out; 4840 - 4841 - while (1) { 4842 - eb = path->nodes[0]; 4843 - slot = path->slots[0]; 4844 - if (slot >= btrfs_header_nritems(eb)) { 4845 - ret = btrfs_next_leaf(root, path); 4846 - if (ret < 0) { 4847 - 
goto out; 4848 - } else if (ret > 0) { 4849 - ret = 0; 4850 - break; 4851 - } 4852 - continue; 4853 - } 4854 - 4855 - btrfs_item_key_to_cpu(eb, &found_key, slot); 4890 + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 4856 4891 if (found_key.objectid != key.objectid || 4857 4892 found_key.type != key.type) { 4858 4893 ret = 0; 4859 - goto out; 4894 + break; 4860 4895 } 4861 4896 4862 4897 ret = iterate_dir_item(root, path, __process_new_xattr, sctx); 4863 4898 if (ret < 0) 4864 - goto out; 4865 - 4866 - path->slots[0]++; 4899 + break; 4867 4900 } 4901 + /* Catch error found during iteration */ 4902 + if (iter_ret < 0) 4903 + ret = iter_ret; 4868 4904 4869 - out: 4870 4905 btrfs_free_path(path); 4871 4906 return ret; 4872 4907 } ··· 4875 4946 { 4876 4947 struct btrfs_root *root = sctx->send_root; 4877 4948 struct btrfs_fs_info *fs_info = root->fs_info; 4878 - struct inode *inode; 4879 4949 struct page *page; 4880 4950 pgoff_t index = offset >> PAGE_SHIFT; 4881 4951 pgoff_t last_index; ··· 4885 4957 if (ret) 4886 4958 return ret; 4887 4959 4888 - inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); 4889 - if (IS_ERR(inode)) 4890 - return PTR_ERR(inode); 4891 - 4892 4960 last_index = (offset + len - 1) >> PAGE_SHIFT; 4893 - 4894 - /* initial readahead */ 4895 - memset(&sctx->ra, 0, sizeof(struct file_ra_state)); 4896 - file_ra_state_init(&sctx->ra, inode->i_mapping); 4897 4961 4898 4962 while (index <= last_index) { 4899 4963 unsigned cur_len = min_t(unsigned, len, 4900 4964 PAGE_SIZE - pg_offset); 4901 4965 4902 - page = find_lock_page(inode->i_mapping, index); 4966 + page = find_lock_page(sctx->cur_inode->i_mapping, index); 4903 4967 if (!page) { 4904 - page_cache_sync_readahead(inode->i_mapping, &sctx->ra, 4905 - NULL, index, last_index + 1 - index); 4968 + page_cache_sync_readahead(sctx->cur_inode->i_mapping, 4969 + &sctx->ra, NULL, index, 4970 + last_index + 1 - index); 4906 4971 4907 - page = find_or_create_page(inode->i_mapping, index, 4908 - 
GFP_KERNEL); 4972 + page = find_or_create_page(sctx->cur_inode->i_mapping, 4973 + index, GFP_KERNEL); 4909 4974 if (!page) { 4910 4975 ret = -ENOMEM; 4911 4976 break; 4912 4977 } 4913 4978 } 4914 4979 4915 - if (PageReadahead(page)) { 4916 - page_cache_async_readahead(inode->i_mapping, &sctx->ra, 4917 - NULL, page, index, last_index + 1 - index); 4918 - } 4980 + if (PageReadahead(page)) 4981 + page_cache_async_readahead(sctx->cur_inode->i_mapping, 4982 + &sctx->ra, NULL, page, index, 4983 + last_index + 1 - index); 4919 4984 4920 4985 if (!PageUptodate(page)) { 4921 4986 btrfs_readpage(NULL, page); ··· 4934 5013 len -= cur_len; 4935 5014 sctx->send_size += cur_len; 4936 5015 } 4937 - iput(inode); 5016 + 4938 5017 return ret; 4939 5018 } 4940 5019 ··· 5141 5220 const u64 offset, 5142 5221 const u64 len) 5143 5222 { 5223 + const u64 end = offset + len; 5144 5224 u64 read_size = max_send_read_size(sctx); 5145 5225 u64 sent = 0; 5146 5226 5147 5227 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) 5148 5228 return send_update_extent(sctx, offset, len); 5229 + 5230 + if (sctx->cur_inode == NULL) { 5231 + struct btrfs_root *root = sctx->send_root; 5232 + 5233 + sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root); 5234 + if (IS_ERR(sctx->cur_inode)) { 5235 + int err = PTR_ERR(sctx->cur_inode); 5236 + 5237 + sctx->cur_inode = NULL; 5238 + return err; 5239 + } 5240 + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); 5241 + file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); 5242 + 5243 + /* 5244 + * It's very likely there are no pages from this inode in the page 5245 + * cache, so after reading extents and sending their data, we clean 5246 + * the page cache to avoid trashing the page cache (adding pressure 5247 + * to the page cache and forcing eviction of other data more useful 5248 + * for applications). 
5249 + * 5250 + * We decide if we should clean the page cache simply by checking 5251 + * if the inode's mapping nrpages is 0 when we first open it, and 5252 + * not by using something like filemap_range_has_page() before 5253 + * reading an extent because when we ask the readahead code to 5254 + * read a given file range, it may (and almost always does) read 5255 + * pages from beyond that range (see the documentation for 5256 + * page_cache_sync_readahead()), so it would not be reliable, 5257 + * because after reading the first extent future calls to 5258 + * filemap_range_has_page() would return true because the readahead 5259 + * on the previous extent resulted in reading pages of the current 5260 + * extent as well. 5261 + */ 5262 + sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0); 5263 + sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE); 5264 + } 5149 5265 5150 5266 while (sent < len) { 5151 5267 u64 size = min(len - sent, read_size); ··· 5193 5235 return ret; 5194 5236 sent += size; 5195 5237 } 5238 + 5239 + if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { 5240 + /* 5241 + * Always operate only on ranges that are a multiple of the page 5242 + * size. This is not only to prevent zeroing parts of a page in 5243 + * the case of subpage sector size, but also to guarantee we evict 5244 + * pages, as passing a range that is smaller than page size does 5245 + * not evict the respective page (only zeroes part of its content). 5246 + * 5247 + * Always start from the end offset of the last range cleared. 5248 + * This is because the readahead code may (and very often does) 5249 + * read pages beyond the range we request for readahead. So if 5250 + * we have an extent layout like this: 5251 + * 5252 + * [ extent A ] [ extent B ] [ extent C ] 5253 + * 5254 + * When we ask page_cache_sync_readahead() to read extent A, it 5255 + * may also trigger reads for pages of extent B.
If we are doing 5256 + * an incremental send and extent B has not changed between the 5257 + * parent and send snapshots, some or all of its pages may end 5258 + * up being read and placed in the page cache. So when truncating 5259 + * the page cache we always start from the end offset of the 5260 + * previously processed extent up to the end of the current 5261 + * extent. 5262 + */ 5263 + truncate_inode_pages_range(&sctx->cur_inode->i_data, 5264 + sctx->page_cache_clear_start, 5265 + end - 1); 5266 + sctx->page_cache_clear_start = end; 5267 + } 5268 + 5196 5269 return 0; 5197 5270 } 5198 5271 ··· 5954 5965 5955 5966 static int process_all_extents(struct send_ctx *sctx) 5956 5967 { 5957 - int ret; 5968 + int ret = 0; 5969 + int iter_ret = 0; 5958 5970 struct btrfs_root *root; 5959 5971 struct btrfs_path *path; 5960 5972 struct btrfs_key key; 5961 5973 struct btrfs_key found_key; 5962 - struct extent_buffer *eb; 5963 - int slot; 5964 5974 5965 5975 root = sctx->send_root; 5966 5976 path = alloc_path_for_send(); ··· 5969 5981 key.objectid = sctx->cmp_key->objectid; 5970 5982 key.type = BTRFS_EXTENT_DATA_KEY; 5971 5983 key.offset = 0; 5972 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5973 - if (ret < 0) 5974 - goto out; 5975 - 5976 - while (1) { 5977 - eb = path->nodes[0]; 5978 - slot = path->slots[0]; 5979 - 5980 - if (slot >= btrfs_header_nritems(eb)) { 5981 - ret = btrfs_next_leaf(root, path); 5982 - if (ret < 0) { 5983 - goto out; 5984 - } else if (ret > 0) { 5985 - ret = 0; 5986 - break; 5987 - } 5988 - continue; 5989 - } 5990 - 5991 - btrfs_item_key_to_cpu(eb, &found_key, slot); 5992 - 5984 + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 5993 5985 if (found_key.objectid != key.objectid || 5994 5986 found_key.type != key.type) { 5995 5987 ret = 0; 5996 - goto out; 5988 + break; 5997 5989 } 5998 5990 5999 5991 ret = process_extent(sctx, path, &found_key); 6000 5992 if (ret < 0) 6001 - goto out; 6002 - 6003 - path->slots[0]++; 5993 + 
break; 6004 5994 } 5995 + /* Catch error found during iteration */ 5996 + if (iter_ret < 0) 5997 + ret = iter_ret; 6005 5998 6006 - out: 6007 5999 btrfs_free_path(path); 6008 6000 return ret; 6009 6001 } ··· 6173 6205 { 6174 6206 LIST_HEAD(deleted_refs); 6175 6207 struct btrfs_path *path; 6208 + struct btrfs_root *root = sctx->parent_root; 6176 6209 struct btrfs_key key; 6210 + struct btrfs_key found_key; 6177 6211 struct parent_paths_ctx ctx; 6212 + int iter_ret = 0; 6178 6213 int ret; 6179 6214 6180 6215 path = alloc_path_for_send(); ··· 6187 6216 key.objectid = sctx->cur_ino; 6188 6217 key.type = BTRFS_INODE_REF_KEY; 6189 6218 key.offset = 0; 6190 - ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); 6191 - if (ret < 0) 6192 - goto out; 6193 6219 6194 6220 ctx.refs = &deleted_refs; 6195 6221 ctx.sctx = sctx; 6196 6222 6197 - while (true) { 6198 - struct extent_buffer *eb = path->nodes[0]; 6199 - int slot = path->slots[0]; 6200 - 6201 - if (slot >= btrfs_header_nritems(eb)) { 6202 - ret = btrfs_next_leaf(sctx->parent_root, path); 6203 - if (ret < 0) 6204 - goto out; 6205 - else if (ret > 0) 6206 - break; 6207 - continue; 6208 - } 6209 - 6210 - btrfs_item_key_to_cpu(eb, &key, slot); 6211 - if (key.objectid != sctx->cur_ino) 6223 + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 6224 + if (found_key.objectid != key.objectid) 6212 6225 break; 6213 - if (key.type != BTRFS_INODE_REF_KEY && 6214 - key.type != BTRFS_INODE_EXTREF_KEY) 6226 + if (found_key.type != key.type && 6227 + found_key.type != BTRFS_INODE_EXTREF_KEY) 6215 6228 break; 6216 6229 6217 - ret = iterate_inode_ref(sctx->parent_root, path, &key, 1, 6230 + ret = iterate_inode_ref(root, path, &found_key, 1, 6218 6231 record_parent_ref, &ctx); 6219 6232 if (ret < 0) 6220 6233 goto out; 6221 - 6222 - path->slots[0]++; 6234 + } 6235 + /* Catch error found during iteration */ 6236 + if (iter_ret < 0) { 6237 + ret = iter_ret; 6238 + goto out; 6223 6239 } 6224 6240 6225 6241 while 
(!list_empty(&deleted_refs)) { ··· 6228 6270 return ret; 6229 6271 } 6230 6272 6273 + static void close_current_inode(struct send_ctx *sctx) 6274 + { 6275 + u64 i_size; 6276 + 6277 + if (sctx->cur_inode == NULL) 6278 + return; 6279 + 6280 + i_size = i_size_read(sctx->cur_inode); 6281 + 6282 + /* 6283 + * If we are doing an incremental send, we may have extents between the 6284 + * last processed extent and the i_size that have not been processed 6285 + * because they haven't changed but we may have read some of their pages 6286 + * through readahead, see the comments at send_extent_data(). 6287 + */ 6288 + if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size) 6289 + truncate_inode_pages_range(&sctx->cur_inode->i_data, 6290 + sctx->page_cache_clear_start, 6291 + round_up(i_size, PAGE_SIZE) - 1); 6292 + 6293 + iput(sctx->cur_inode); 6294 + sctx->cur_inode = NULL; 6295 + } 6296 + 6231 6297 static int changed_inode(struct send_ctx *sctx, 6232 6298 enum btrfs_compare_tree_result result) 6233 6299 { ··· 6261 6279 struct btrfs_inode_item *right_ii = NULL; 6262 6280 u64 left_gen = 0; 6263 6281 u64 right_gen = 0; 6282 + 6283 + close_current_inode(sctx); 6264 6284 6265 6285 sctx->cur_ino = key->objectid; 6266 6286 sctx->cur_inode_new_gen = 0; ··· 7518 7534 7519 7535 INIT_LIST_HEAD(&sctx->new_refs); 7520 7536 INIT_LIST_HEAD(&sctx->deleted_refs); 7521 - INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); 7537 + xa_init_flags(&sctx->name_cache, GFP_KERNEL); 7522 7538 INIT_LIST_HEAD(&sctx->name_cache_list); 7523 7539 7524 7540 sctx->flags = arg->flags; ··· 7749 7765 kvfree(sctx->send_buf); 7750 7766 7751 7767 name_cache_free(sctx); 7768 + 7769 + close_current_inode(sctx); 7752 7770 7753 7771 kfree(sctx); 7754 7772 }

+10 -1

fs/btrfs/space-info.c

··· 181 181 found->full = 0; 182 182 } 183 183 184 + /* 185 + * Block groups with more than this value (percents) of unusable space will be 186 + * scheduled for background reclaim. 187 + */ 188 + #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) 189 + 184 190 static int create_space_info(struct btrfs_fs_info *info, u64 flags) 185 191 { 186 192 ··· 208 202 INIT_LIST_HEAD(&space_info->tickets); 209 203 INIT_LIST_HEAD(&space_info->priority_tickets); 210 204 space_info->clamp = 1; 205 + 206 + if (btrfs_is_zoned(info)) 207 + space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; 211 208 212 209 ret = btrfs_sysfs_add_space_info_type(info, space_info); 213 210 if (ret) ··· 528 519 items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; 529 520 } 530 521 531 - trans = (struct btrfs_trans_handle *)current->journal_info; 522 + trans = current->journal_info; 532 523 533 524 /* 534 525 * If we are doing more ordered than delalloc we need to just wait on

+8

fs/btrfs/space-info.h

··· 3 3 #ifndef BTRFS_SPACE_INFO_H 4 4 #define BTRFS_SPACE_INFO_H 5 5 6 + #include "volumes.h" 7 + 6 8 struct btrfs_space_info { 7 9 spinlock_t lock; 8 10 ··· 25 23 u64 max_extent_size; /* This will hold the maximum extent size of 26 24 the space info if we had an ENOSPC in the 27 25 allocator. */ 26 + 27 + /* 28 + * Once a block group drops below this threshold (percents) we'll 29 + * schedule it for reclaim. 30 + */ 31 + int bg_reclaim_threshold; 28 32 29 33 int clamp; /* Used to scale our threshold for preemptive 30 34 flushing. The value is >> clamp, so turns

+39 -16

fs/btrfs/subpage.c

··· 63 63 * This means a slightly higher tree locking latency. 64 64 */ 65 65 66 + bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) 67 + { 68 + if (fs_info->sectorsize >= PAGE_SIZE) 69 + return false; 70 + 71 + /* 72 + * Only data pages (either through DIO or compression) can have no 73 + * mapping. And if page->mapping->host is data inode, it's subpage. 74 + * As we have ruled out sectorsize >= PAGE_SIZE case already. 75 + */ 76 + if (!page->mapping || !page->mapping->host || 77 + is_data_inode(page->mapping->host)) 78 + return true; 79 + 80 + /* 81 + * Now the only remaining case is metadata, which we only go subpage 82 + * routine if nodesize < PAGE_SIZE. 83 + */ 84 + if (fs_info->nodesize < PAGE_SIZE) 85 + return true; 86 + return false; 87 + } 88 + 66 89 void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) 67 90 { 68 91 unsigned int cur = 0; ··· 130 107 ASSERT(PageLocked(page)); 131 108 132 109 /* Either not subpage, or the page already has private attached */ 133 - if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) 110 + if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page)) 134 111 return 0; 135 112 136 113 subpage = btrfs_alloc_subpage(fs_info, type); ··· 147 124 struct btrfs_subpage *subpage; 148 125 149 126 /* Either not subpage, or already detached */ 150 - if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page)) 127 + if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page)) 151 128 return; 152 129 153 - subpage = (struct btrfs_subpage *)detach_page_private(page); 130 + subpage = detach_page_private(page); 154 131 ASSERT(subpage); 155 132 btrfs_free_subpage(subpage); 156 133 } ··· 198 175 { 199 176 struct btrfs_subpage *subpage; 200 177 201 - if (fs_info->sectorsize == PAGE_SIZE) 178 + if (!btrfs_is_subpage(fs_info, page)) 202 179 return; 203 180 204 181 ASSERT(PagePrivate(page) && page->mapping); ··· 213 190 { 214 191 struct btrfs_subpage *subpage; 215 192 216 - if
(fs_info->sectorsize == PAGE_SIZE) 193 + if (!btrfs_is_subpage(fs_info, page)) 217 194 return; 218 195 219 196 ASSERT(PagePrivate(page) && page->mapping); ··· 342 319 int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, 343 320 struct page *page, u64 start, u32 len) 344 321 { 345 - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { 322 + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { 346 323 lock_page(page); 347 324 return 0; 348 325 } ··· 359 336 void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, 360 337 struct page *page, u64 start, u32 len) 361 338 { 362 - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) 339 + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) 363 340 return unlock_page(page); 364 341 btrfs_subpage_clamp_range(page, &start, &len); 365 342 if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) ··· 643 620 void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ 644 621 struct page *page, u64 start, u32 len) \ 645 622 { \ 646 - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ 623 + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ 647 624 set_page_func(page); \ 648 625 return; \ 649 626 } \ ··· 652 629 void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ 653 630 struct page *page, u64 start, u32 len) \ 654 631 { \ 655 - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ 632 + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ 656 633 clear_page_func(page); \ 657 634 return; \ 658 635 } \ ··· 661 638 bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ 662 639 struct page *page, u64 start, u32 len) \ 663 640 { \ 664 - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ 641 + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ 665 642 return test_page_func(page); \ 666 643 return btrfs_subpage_test_##name(fs_info, page, start, len); \ 667 644 } \ 
668 645 void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ 669 646 struct page *page, u64 start, u32 len) \ 670 647 { \ 671 - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ 648 + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ 672 649 set_page_func(page); \ 673 650 return; \ 674 651 } \ ··· 678 655 void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ 679 656 struct page *page, u64 start, u32 len) \ 680 657 { \ 681 - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ 658 + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ 682 659 clear_page_func(page); \ 683 660 return; \ 684 661 } \ ··· 688 665 bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ 689 666 struct page *page, u64 start, u32 len) \ 690 667 { \ 691 - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ 668 + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ 692 669 return test_page_func(page); \ 693 670 btrfs_subpage_clamp_range(page, &start, &len); \ 694 671 return btrfs_subpage_test_##name(fs_info, page, start, len); \ ··· 717 694 return; 718 695 719 696 ASSERT(!PageDirty(page)); 720 - if (fs_info->sectorsize == PAGE_SIZE) 697 + if (!btrfs_is_subpage(fs_info, page)) 721 698 return; 722 699 723 700 ASSERT(PagePrivate(page) && page->private); ··· 745 722 struct btrfs_subpage *subpage; 746 723 747 724 ASSERT(PageLocked(page)); 748 - /* For regular page size case, we just unlock the page */ 749 - if (fs_info->sectorsize == PAGE_SIZE) 725 + /* For non-subpage case, we just unlock the page */ 726 + if (!btrfs_is_subpage(fs_info, page)) 750 727 return unlock_page(page); 751 728 752 729 ASSERT(PagePrivate(page) && page->private);

+2

fs/btrfs/subpage.h

··· 74 74 BTRFS_SUBPAGE_DATA, 75 75 }; 76 76 77 + bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page); 78 + 77 79 void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); 78 80 int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, 79 81 struct page *page, enum btrfs_subpage_type type);

+4 -5

fs/btrfs/super.c

··· 261 261 RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), 262 262 }; 263 263 264 - void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 264 + void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 265 265 { 266 266 char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; 267 267 struct va_format vaf; ··· 292 292 char statestr[STATE_STRING_BUF_LEN]; 293 293 294 294 btrfs_state_to_string(fs_info, statestr); 295 - printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, 295 + _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, 296 296 fs_info->sb->s_id, statestr, &vaf); 297 297 } else { 298 - printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); 298 + _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); 299 299 } 300 300 } 301 301 ··· 1903 1903 old_pool_size, new_pool_size); 1904 1904 1905 1905 btrfs_workqueue_set_max(fs_info->workers, new_pool_size); 1906 + btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); 1906 1907 btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); 1907 1908 btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); 1908 1909 btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); ··· 1913 1912 btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); 1914 1913 btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); 1915 1914 btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); 1916 - btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, 1917 - new_pool_size); 1918 1915 } 1919 1916 1920 1917 static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,

+39 -4

fs/btrfs/sysfs.c

··· 394 394 { 395 395 ssize_t ret = 0; 396 396 397 - /* 4K sector size is also supported with 64K page size */ 398 - if (PAGE_SIZE == SZ_64K) 397 + /* An artificial limit to only support 4K and PAGE_SIZE */ 398 + if (PAGE_SIZE > SZ_4K) 399 399 ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); 400 - 401 - /* Only sectorsize == PAGE_SIZE is now supported */ 402 400 ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); 403 401 404 402 return ret; ··· 720 722 SPACE_INFO_ATTR(disk_used); 721 723 SPACE_INFO_ATTR(disk_total); 722 724 725 + static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, 726 + struct kobj_attribute *a, 727 + char *buf) 728 + { 729 + struct btrfs_space_info *space_info = to_space_info(kobj); 730 + ssize_t ret; 731 + 732 + ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); 733 + 734 + return ret; 735 + } 736 + 737 + static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, 738 + struct kobj_attribute *a, 739 + const char *buf, size_t len) 740 + { 741 + struct btrfs_space_info *space_info = to_space_info(kobj); 742 + int thresh; 743 + int ret; 744 + 745 + ret = kstrtoint(buf, 10, &thresh); 746 + if (ret) 747 + return ret; 748 + 749 + if (thresh < 0 || thresh > 100) 750 + return -EINVAL; 751 + 752 + WRITE_ONCE(space_info->bg_reclaim_threshold, thresh); 753 + 754 + return len; 755 + } 756 + 757 + BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, 758 + btrfs_sinfo_bg_reclaim_threshold_show, 759 + btrfs_sinfo_bg_reclaim_threshold_store); 760 + 723 761 /* 724 762 * Allocation information about block group types. 725 763 * ··· 772 738 BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), 773 739 BTRFS_ATTR_PTR(space_info, disk_used), 774 740 BTRFS_ATTR_PTR(space_info, disk_total), 741 + BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), 775 742 NULL, 776 743 }; 777 744 ATTRIBUTE_GROUPS(space_info);

+4 -20

fs/btrfs/tests/btrfs-tests.c

··· 150 150 151 151 void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) 152 152 { 153 - struct radix_tree_iter iter; 154 - void **slot; 153 + unsigned long index; 154 + struct extent_buffer *eb; 155 155 struct btrfs_device *dev, *tmp; 156 156 157 157 if (!fs_info) ··· 163 163 164 164 test_mnt->mnt_sb->s_fs_info = NULL; 165 165 166 - spin_lock(&fs_info->buffer_lock); 167 - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { 168 - struct extent_buffer *eb; 169 - 170 - eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); 171 - if (!eb) 172 - continue; 173 - /* Shouldn't happen but that kind of thinking creates CVE's */ 174 - if (radix_tree_exception(eb)) { 175 - if (radix_tree_deref_retry(eb)) 176 - slot = radix_tree_iter_retry(&iter); 177 - continue; 178 - } 179 - slot = radix_tree_iter_resume(slot, &iter); 180 - spin_unlock(&fs_info->buffer_lock); 166 + xa_for_each(&fs_info->extent_buffers, index, eb) { 181 167 free_extent_buffer_stale(eb); 182 - spin_lock(&fs_info->buffer_lock); 183 168 } 184 - spin_unlock(&fs_info->buffer_lock); 185 169 186 170 btrfs_mapping_tree_free(&fs_info->mapping_tree); 187 171 list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, ··· 186 202 if (!root) 187 203 return; 188 204 /* Will be freed by btrfs_free_fs_roots */ 189 - if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) 205 + if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state))) 190 206 return; 191 207 btrfs_global_root_delete(root); 192 208 btrfs_put_root(root);

+47 -61

fs/btrfs/transaction.c

··· 23 23 #include "space-info.h" 24 24 #include "zoned.h" 25 25 26 - #define BTRFS_ROOT_TRANS_TAG 0 26 + #define BTRFS_ROOT_TRANS_TAG XA_MARK_0 27 27 28 28 /* 29 29 * Transaction states and transitions ··· 221 221 * the caching thread will re-start it's search from 3, and thus find 222 222 * the hole from [4,6) to add to the free space cache. 223 223 */ 224 - spin_lock(&fs_info->block_group_cache_lock); 224 + write_lock(&fs_info->block_group_cache_lock); 225 225 list_for_each_entry_safe(caching_ctl, next, 226 226 &fs_info->caching_block_groups, list) { 227 227 struct btrfs_block_group *cache = caching_ctl->block_group; ··· 234 234 cache->last_byte_to_unpin = caching_ctl->progress; 235 235 } 236 236 } 237 - spin_unlock(&fs_info->block_group_cache_lock); 237 + write_unlock(&fs_info->block_group_cache_lock); 238 238 up_write(&fs_info->commit_root_sem); 239 239 } 240 240 ··· 437 437 */ 438 438 smp_wmb(); 439 439 440 - spin_lock(&fs_info->fs_roots_radix_lock); 440 + spin_lock(&fs_info->fs_roots_lock); 441 441 if (root->last_trans == trans->transid && !force) { 442 - spin_unlock(&fs_info->fs_roots_radix_lock); 442 + spin_unlock(&fs_info->fs_roots_lock); 443 443 return 0; 444 444 } 445 - radix_tree_tag_set(&fs_info->fs_roots_radix, 446 - (unsigned long)root->root_key.objectid, 447 - BTRFS_ROOT_TRANS_TAG); 448 - spin_unlock(&fs_info->fs_roots_radix_lock); 445 + xa_set_mark(&fs_info->fs_roots, 446 + (unsigned long)root->root_key.objectid, 447 + BTRFS_ROOT_TRANS_TAG); 448 + spin_unlock(&fs_info->fs_roots_lock); 449 449 root->last_trans = trans->transid; 450 450 451 451 /* this is pretty tricky. 
We don't want to ··· 487 487 spin_unlock(&cur_trans->dropped_roots_lock); 488 488 489 489 /* Make sure we don't try to update the root at commit time */ 490 - spin_lock(&fs_info->fs_roots_radix_lock); 491 - radix_tree_tag_clear(&fs_info->fs_roots_radix, 492 - (unsigned long)root->root_key.objectid, 493 - BTRFS_ROOT_TRANS_TAG); 494 - spin_unlock(&fs_info->fs_roots_radix_lock); 490 + xa_clear_mark(&fs_info->fs_roots, 491 + (unsigned long)root->root_key.objectid, 492 + BTRFS_ROOT_TRANS_TAG); 495 493 } 496 494 497 495 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, ··· 1402 1404 static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) 1403 1405 { 1404 1406 struct btrfs_fs_info *fs_info = trans->fs_info; 1405 - struct btrfs_root *gang[8]; 1406 - int i; 1407 - int ret; 1407 + struct btrfs_root *root; 1408 + unsigned long index; 1408 1409 1409 1410 /* 1410 1411 * At this point no one can be using this transaction to modify any tree ··· 1411 1414 */ 1412 1415 ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); 1413 1416 1414 - spin_lock(&fs_info->fs_roots_radix_lock); 1415 - while (1) { 1416 - ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, 1417 - (void **)gang, 0, 1418 - ARRAY_SIZE(gang), 1419 - BTRFS_ROOT_TRANS_TAG); 1420 - if (ret == 0) 1421 - break; 1422 - for (i = 0; i < ret; i++) { 1423 - struct btrfs_root *root = gang[i]; 1424 - int ret2; 1417 + spin_lock(&fs_info->fs_roots_lock); 1418 + xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) { 1419 + int ret; 1425 1420 1426 - /* 1427 - * At this point we can neither have tasks logging inodes 1428 - * from a root nor trying to commit a log tree. 1429 - */ 1430 - ASSERT(atomic_read(&root->log_writers) == 0); 1431 - ASSERT(atomic_read(&root->log_commit[0]) == 0); 1432 - ASSERT(atomic_read(&root->log_commit[1]) == 0); 1421 + /* 1422 + * At this point we can neither have tasks logging inodes 1423 + * from a root nor trying to commit a log tree. 
1424 + */ 1425 + ASSERT(atomic_read(&root->log_writers) == 0); 1426 + ASSERT(atomic_read(&root->log_commit[0]) == 0); 1427 + ASSERT(atomic_read(&root->log_commit[1]) == 0); 1433 1428 1434 - radix_tree_tag_clear(&fs_info->fs_roots_radix, 1435 - (unsigned long)root->root_key.objectid, 1436 - BTRFS_ROOT_TRANS_TAG); 1437 - spin_unlock(&fs_info->fs_roots_radix_lock); 1429 + xa_clear_mark(&fs_info->fs_roots, 1430 + (unsigned long)root->root_key.objectid, 1431 + BTRFS_ROOT_TRANS_TAG); 1432 + spin_unlock(&fs_info->fs_roots_lock); 1438 1433 1439 - btrfs_free_log(trans, root); 1440 - ret2 = btrfs_update_reloc_root(trans, root); 1441 - if (ret2) 1442 - return ret2; 1434 + btrfs_free_log(trans, root); 1435 + ret = btrfs_update_reloc_root(trans, root); 1436 + if (ret) 1437 + return ret; 1443 1438 1444 - /* see comments in should_cow_block() */ 1445 - clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); 1446 - smp_mb__after_atomic(); 1439 + /* See comments in should_cow_block() */ 1440 + clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); 1441 + smp_mb__after_atomic(); 1447 1442 1448 - if (root->commit_root != root->node) { 1449 - list_add_tail(&root->dirty_list, 1450 - &trans->transaction->switch_commits); 1451 - btrfs_set_root_node(&root->root_item, 1452 - root->node); 1453 - } 1454 - 1455 - ret2 = btrfs_update_root(trans, fs_info->tree_root, 1456 - &root->root_key, 1457 - &root->root_item); 1458 - if (ret2) 1459 - return ret2; 1460 - spin_lock(&fs_info->fs_roots_radix_lock); 1461 - btrfs_qgroup_free_meta_all_pertrans(root); 1443 + if (root->commit_root != root->node) { 1444 + list_add_tail(&root->dirty_list, 1445 + &trans->transaction->switch_commits); 1446 + btrfs_set_root_node(&root->root_item, root->node); 1462 1447 } 1448 + 1449 + ret = btrfs_update_root(trans, fs_info->tree_root, 1450 + &root->root_key, &root->root_item); 1451 + if (ret) 1452 + return ret; 1453 + spin_lock(&fs_info->fs_roots_lock); 1454 + btrfs_qgroup_free_meta_all_pertrans(root); 1463 1455 } 1464 - 
spin_unlock(&fs_info->fs_roots_radix_lock); 1456 + spin_unlock(&fs_info->fs_roots_lock); 1465 1457 return 0; 1466 1458 } 1467 1459

+55

fs/btrfs/tree-checker.c

··· 1855 1855 return ret; 1856 1856 } 1857 1857 ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); 1858 + 1859 + int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) 1860 + { 1861 + const bool is_subvol = is_fstree(root_owner); 1862 + const u64 eb_owner = btrfs_header_owner(eb); 1863 + 1864 + /* 1865 + * Skip dummy fs, as selftests don't create unique ebs for each dummy 1866 + * root. 1867 + */ 1868 + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state)) 1869 + return 0; 1870 + /* 1871 + * There are several call sites (backref walking, qgroup, and data 1872 + * reloc) passing 0 as @root_owner, as they are not holding the 1873 + * tree root. In that case, we can not do a reliable ownership check, 1874 + * so just exit. 1875 + */ 1876 + if (root_owner == 0) 1877 + return 0; 1878 + /* 1879 + * These trees use key.offset as their owner, our callers don't have 1880 + * the extra capacity to pass key.offset here. So we just skip them. 1881 + */ 1882 + if (root_owner == BTRFS_TREE_LOG_OBJECTID || 1883 + root_owner == BTRFS_TREE_RELOC_OBJECTID) 1884 + return 0; 1885 + 1886 + if (!is_subvol) { 1887 + /* For non-subvolume trees, the eb owner should match root owner */ 1888 + if (unlikely(root_owner != eb_owner)) { 1889 + btrfs_crit(eb->fs_info, 1890 + "corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu", 1891 + btrfs_header_level(eb) == 0 ? "leaf" : "node", 1892 + root_owner, btrfs_header_bytenr(eb), eb_owner, 1893 + root_owner); 1894 + return -EUCLEAN; 1895 + } 1896 + return 0; 1897 + } 1898 + 1899 + /* 1900 + * For subvolume trees, owners can mismatch, but they should all belong 1901 + * to subvolume trees. 1902 + */ 1903 + if (unlikely(is_subvol != is_fstree(eb_owner))) { 1904 + btrfs_crit(eb->fs_info, 1905 + "corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]", 1906 + btrfs_header_level(eb) == 0 ? 
"leaf" : "node", 1907 + root_owner, btrfs_header_bytenr(eb), eb_owner, 1908 + BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID); 1909 + return -EUCLEAN; 1910 + } 1911 + return 0; 1912 + }

+1

fs/btrfs/tree-checker.h

··· 25 25 26 26 int btrfs_check_chunk_valid(struct extent_buffer *leaf, 27 27 struct btrfs_chunk *chunk, u64 logical); 28 + int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); 28 29 29 30 #endif

+5 -6

fs/btrfs/tree-log.c

··· 333 333 * pin down any logged extents, so we have to read the block. 334 334 */ 335 335 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 336 - ret = btrfs_read_buffer(eb, gen, level, NULL); 336 + ret = btrfs_read_extent_buffer(eb, gen, level, NULL); 337 337 if (ret) 338 338 return ret; 339 339 } ··· 894 894 btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); 895 895 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 896 896 out: 897 - if (inode) 898 - iput(inode); 897 + iput(inode); 899 898 return ret; 900 899 } 901 900 ··· 2574 2575 int i; 2575 2576 int ret; 2576 2577 2577 - ret = btrfs_read_buffer(eb, gen, level, NULL); 2578 + ret = btrfs_read_extent_buffer(eb, gen, level, NULL); 2578 2579 if (ret) 2579 2580 return ret; 2580 2581 ··· 2785 2786 2786 2787 path->slots[*level]++; 2787 2788 if (wc->free) { 2788 - ret = btrfs_read_buffer(next, ptr_gen, 2789 + ret = btrfs_read_extent_buffer(next, ptr_gen, 2789 2790 *level - 1, &first_key); 2790 2791 if (ret) { 2791 2792 free_extent_buffer(next); ··· 2814 2815 free_extent_buffer(next); 2815 2816 continue; 2816 2817 } 2817 - ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2818 + ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); 2818 2819 if (ret) { 2819 2820 free_extent_buffer(next); 2820 2821 return ret;

+40 -85

fs/btrfs/volumes.c

··· 164 164 */ 165 165 enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) 166 166 { 167 - if (flags & BTRFS_BLOCK_GROUP_RAID10) 168 - return BTRFS_RAID_RAID10; 169 - else if (flags & BTRFS_BLOCK_GROUP_RAID1) 170 - return BTRFS_RAID_RAID1; 171 - else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) 172 - return BTRFS_RAID_RAID1C3; 173 - else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) 174 - return BTRFS_RAID_RAID1C4; 175 - else if (flags & BTRFS_BLOCK_GROUP_DUP) 176 - return BTRFS_RAID_DUP; 177 - else if (flags & BTRFS_BLOCK_GROUP_RAID0) 178 - return BTRFS_RAID_RAID0; 179 - else if (flags & BTRFS_BLOCK_GROUP_RAID5) 180 - return BTRFS_RAID_RAID5; 181 - else if (flags & BTRFS_BLOCK_GROUP_RAID6) 182 - return BTRFS_RAID_RAID6; 167 + const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); 183 168 184 - return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 169 + if (!profile) 170 + return BTRFS_RAID_SINGLE; 171 + 172 + return BTRFS_BG_FLAG_TO_INDEX(profile); 185 173 } 186 174 187 175 const char *btrfs_bg_type_to_raid_name(u64 flags) ··· 4050 4062 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 4051 4063 return true; 4052 4064 4053 - if (fs_info->sectorsize < PAGE_SIZE && 4054 - bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { 4055 - btrfs_err(fs_info, 4056 - "RAID56 is not yet supported for sectorsize %u with page size %lu", 4057 - fs_info->sectorsize, PAGE_SIZE); 4058 - return false; 4059 - } 4060 4065 /* Profile is valid and does not have bits outside of the allowed set */ 4061 4066 if (alloc_profile_is_valid(bargs->target, 1) && 4062 4067 (bargs->target & ~allowed) == 0) ··· 6293 6312 u64 offset; 6294 6313 u64 stripe_offset; 6295 6314 u64 stripe_nr; 6296 - u64 stripe_len; 6315 + u32 stripe_len; 6297 6316 u64 raid56_full_stripe_start = (u64)-1; 6298 6317 int data_stripes; 6299 6318 ··· 6304 6323 offset = logical - em->start; 6305 6324 /* Len of a stripe in a chunk */ 6306 6325 stripe_len = map->stripe_len; 6307 - /* Stripe where this block 
falls in */ 6308 - stripe_nr = div64_u64(offset, stripe_len); 6309 - /* Offset of stripe in the chunk */ 6310 - stripe_offset = stripe_nr * stripe_len; 6311 - if (offset < stripe_offset) { 6312 - btrfs_crit(fs_info, 6313 - "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", 6314 - stripe_offset, offset, em->start, logical, stripe_len); 6315 - return -EINVAL; 6316 - } 6326 + /* 6327 + * Stripe_nr is where this block falls in 6328 + * stripe_offset is the offset of this block in its stripe. 6329 + */ 6330 + stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); 6331 + ASSERT(stripe_offset < U32_MAX); 6317 6332 6318 - /* stripe_offset is the offset of this block in its stripe */ 6319 - stripe_offset = offset - stripe_offset; 6320 6333 data_stripes = nr_data_stripes(map); 6321 6334 6322 6335 /* Only stripe based profiles needs to check against stripe length. */ ··· 6712 6737 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, 6713 6738 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), 6714 6739 dev->devid, bio->bi_iter.bi_size); 6715 - bio_set_dev(bio, dev->bdev); 6716 6740 6717 6741 btrfs_bio_counter_inc_noblocked(fs_info); 6718 6742 6719 - btrfsic_submit_bio(bio); 6743 + btrfsic_check_bio(bio); 6744 + submit_bio(bio); 6720 6745 } 6721 6746 6722 6747 static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) ··· 6798 6823 continue; 6799 6824 } 6800 6825 6801 - if (dev_nr < total_devs - 1) 6802 - bio = btrfs_bio_clone(first_bio); 6803 - else 6826 + if (dev_nr < total_devs - 1) { 6827 + bio = btrfs_bio_clone(dev->bdev, first_bio); 6828 + } else { 6804 6829 bio = first_bio; 6830 + bio_set_dev(bio, dev->bdev); 6831 + } 6805 6832 6806 6833 submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); 6807 6834 } ··· 7336 7359 7337 7360 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 7338 7361 { 7339 - struct btrfs_root *root = fs_info->tree_root; 7340 7362 struct 
btrfs_super_block *super_copy = fs_info->super_copy; 7341 7363 struct extent_buffer *sb; 7342 7364 struct btrfs_disk_key *disk_key; ··· 7351 7375 struct btrfs_key key; 7352 7376 7353 7377 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 7378 + 7354 7379 /* 7355 - * This will create extent buffer of nodesize, superblock size is 7356 - * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will 7357 - * overallocate but we can keep it as-is, only the first page is used. 7380 + * We allocated a dummy extent, just to use extent buffer accessors. 7381 + * There will be unused space after BTRFS_SUPER_INFO_SIZE, but 7382 + * that's fine, we will not go beyond system chunk array anyway. 7358 7383 */ 7359 - sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, 7360 - root->root_key.objectid, 0); 7361 - if (IS_ERR(sb)) 7362 - return PTR_ERR(sb); 7384 + sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); 7385 + if (!sb) 7386 + return -ENOMEM; 7363 7387 set_extent_buffer_uptodate(sb); 7364 - /* 7365 - * The sb extent buffer is artificial and just used to read the system array. 7366 - * set_extent_buffer_uptodate() call does not properly mark all it's 7367 - * pages up-to-date when the page is larger: extent does not cover the 7368 - * whole page and consequently check_page_uptodate does not find all 7369 - * the page's extents up-to-date (the hole beyond sb), 7370 - * write_extent_buffer then triggers a WARN_ON. 7371 - * 7372 - * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, 7373 - * but sb spans only this function. Add an explicit SetPageUptodate call 7374 - * to silence the warning eg. on PowerPC 64. 
7375 - */ 7376 - if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 7377 - SetPageUptodate(sb->pages[0]); 7378 7388 7379 7389 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7380 7390 array_size = btrfs_super_sys_array_size(super_copy); ··· 7523 7561 struct btrfs_key found_key; 7524 7562 int ret; 7525 7563 int slot; 7564 + int iter_ret = 0; 7526 7565 u64 total_dev = 0; 7527 7566 u64 last_ra_node = 0; 7528 7567 ··· 7567 7604 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7568 7605 key.offset = 0; 7569 7606 key.type = 0; 7570 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7571 - if (ret < 0) 7572 - goto error; 7573 - while (1) { 7574 - struct extent_buffer *node; 7607 + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 7608 + struct extent_buffer *node = path->nodes[1]; 7575 7609 7576 7610 leaf = path->nodes[0]; 7577 7611 slot = path->slots[0]; 7578 - if (slot >= btrfs_header_nritems(leaf)) { 7579 - ret = btrfs_next_leaf(root, path); 7580 - if (ret == 0) 7581 - continue; 7582 - if (ret < 0) 7583 - goto error; 7584 - break; 7585 - } 7586 - node = path->nodes[1]; 7612 + 7587 7613 if (node) { 7588 7614 if (last_ra_node != node->start) { 7589 7615 readahead_tree_node_children(node); 7590 7616 last_ra_node = node->start; 7591 7617 } 7592 7618 } 7593 - btrfs_item_key_to_cpu(leaf, &found_key, slot); 7594 7619 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7595 7620 struct btrfs_dev_item *dev_item; 7596 7621 dev_item = btrfs_item_ptr(leaf, slot, ··· 7603 7652 if (ret) 7604 7653 goto error; 7605 7654 } 7606 - path->slots[0]++; 7655 + } 7656 + /* Catch error found during iteration */ 7657 + if (iter_ret < 0) { 7658 + ret = iter_ret; 7659 + goto error; 7607 7660 } 7608 7661 7609 7662 /* ··· 7615 7660 * do another round of validation checks. 
7616 7661 */ 7617 7662 if (total_dev != fs_info->fs_devices->total_devices) { 7618 - btrfs_err(fs_info, 7619 - "super_num_devices %llu mismatch with num_devices %llu found here", 7663 + btrfs_warn(fs_info, 7664 + "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", 7620 7665 btrfs_super_num_devices(fs_info->super_copy), 7621 7666 total_dev); 7622 - ret = -EINVAL; 7623 - goto error; 7667 + fs_info->fs_devices->total_devices = total_dev; 7668 + btrfs_set_super_num_devices(fs_info->super_copy, total_dev); 7624 7669 } 7625 7670 if (btrfs_super_total_bytes(fs_info->super_copy) < 7626 7671 fs_info->fs_devices->total_rw_bytes) { ··· 8232 8277 8233 8278 static int relocating_repair_kthread(void *data) 8234 8279 { 8235 - struct btrfs_block_group *cache = (struct btrfs_block_group *)data; 8280 + struct btrfs_block_group *cache = data; 8236 8281 struct btrfs_fs_info *fs_info = cache->fs_info; 8237 8282 u64 target; 8238 8283 int ret = 0;

+38 -4

fs/btrfs/volumes.h

··· 17 17 18 18 #define BTRFS_STRIPE_LEN SZ_64K 19 19 20 + /* Used by sanity check for btrfs_raid_types. */ 21 + #define const_ffs(n) (__builtin_ctzll(n) + 1) 22 + 23 + /* 24 + * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires 25 + * RAID0 always to be the lowest profile bit. 26 + * Although it's part of on-disk format and should never change, do extra 27 + * compile-time sanity checks. 28 + */ 29 + static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < 30 + const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); 31 + static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) > 32 + ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); 33 + 34 + /* ilog2() can handle both constants and variables */ 35 + #define BTRFS_BG_FLAG_TO_INDEX(profile) \ 36 + ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1)) 37 + 38 + enum btrfs_raid_types { 39 + /* SINGLE is the special one as it doesn't have on-disk bit. */ 40 + BTRFS_RAID_SINGLE = 0, 41 + 42 + BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0), 43 + BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1), 44 + BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP), 45 + BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10), 46 + BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5), 47 + BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6), 48 + BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3), 49 + BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4), 50 + 51 + BTRFS_NR_RAID_TYPES 52 + }; 53 + 20 54 struct btrfs_io_geometry { 21 55 /* remaining bytes before crossing a stripe */ 22 56 u64 len; 23 57 /* offset of logical address in chunk */ 24 58 u64 offset; 25 59 /* length of single IO stripe */ 26 - u64 stripe_len; 60 + u32 stripe_len; 61 + /* offset of address in stripe */ 62 + u32 stripe_offset; 27 63 /* number of stripe where address falls */ 28 64 u64 stripe_nr; 29 - /* offset of 
address in stripe */ 30 - u64 stripe_offset; 31 65 /* offset of raid56 stripe into the chunk */ 32 66 u64 raid56_stripe_offset; 33 67 }; ··· 464 430 u64 type; 465 431 int io_align; 466 432 int io_width; 467 - u64 stripe_len; 433 + u32 stripe_len; 468 434 int num_stripes; 469 435 int sub_stripes; 470 436 int verified_stripes; /* For mount time dev extent verification */

+11 -29

fs/btrfs/xattr.c

··· 272 272 273 273 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) 274 274 { 275 + struct btrfs_key found_key; 275 276 struct btrfs_key key; 276 277 struct inode *inode = d_inode(dentry); 277 278 struct btrfs_root *root = BTRFS_I(inode)->root; 278 279 struct btrfs_path *path; 280 + int iter_ret = 0; 279 281 int ret = 0; 280 282 size_t total_size = 0, size_left = size; 281 283 ··· 296 294 path->reada = READA_FORWARD; 297 295 298 296 /* search for our xattrs */ 299 - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 300 - if (ret < 0) 301 - goto err; 302 - 303 - while (1) { 297 + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 304 298 struct extent_buffer *leaf; 305 299 int slot; 306 300 struct btrfs_dir_item *di; 307 - struct btrfs_key found_key; 308 301 u32 item_size; 309 302 u32 cur; 310 303 311 304 leaf = path->nodes[0]; 312 305 slot = path->slots[0]; 313 - 314 - /* this is where we start walking through the path */ 315 - if (slot >= btrfs_header_nritems(leaf)) { 316 - /* 317 - * if we've reached the last slot in this leaf we need 318 - * to go to the next leaf and reset everything 319 - */ 320 - ret = btrfs_next_leaf(root, path); 321 - if (ret < 0) 322 - goto err; 323 - else if (ret > 0) 324 - break; 325 - continue; 326 - } 327 - 328 - btrfs_item_key_to_cpu(leaf, &found_key, slot); 329 306 330 307 /* check to make sure this item is what we want */ 331 308 if (found_key.objectid != key.objectid) ··· 312 331 if (found_key.type > BTRFS_XATTR_ITEM_KEY) 313 332 break; 314 333 if (found_key.type < BTRFS_XATTR_ITEM_KEY) 315 - goto next_item; 334 + continue; 316 335 317 336 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 318 337 item_size = btrfs_item_size(leaf, slot); ··· 332 351 goto next; 333 352 334 353 if (!buffer || (name_len + 1) > size_left) { 335 - ret = -ERANGE; 336 - goto err; 354 + iter_ret = -ERANGE; 355 + break; 337 356 } 338 357 339 358 read_extent_buffer(leaf, buffer, name_ptr, name_len); ··· 345 364 
cur += this_len; 346 365 di = (struct btrfs_dir_item *)((char *)di + this_len); 347 366 } 348 - next_item: 349 - path->slots[0]++; 350 367 } 351 - ret = total_size; 352 368 353 - err: 369 + if (iter_ret < 0) 370 + ret = iter_ret; 371 + else 372 + ret = total_size; 373 + 354 374 btrfs_free_path(path); 355 375 356 376 return ret;

+146 -79

fs/btrfs/zoned.c

··· 51 51 #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) 52 52 53 53 /* 54 - * Maximum supported zone size. Currently, SMR disks have a zone size of 55 - * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not 56 - * expect the zone size to become larger than 8GiB in the near future. 54 + * Minimum / maximum supported zone size. Currently, SMR disks have a zone 55 + * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. 56 + * We do not expect the zone size to become larger than 8GiB or smaller than 57 + * 4MiB in the near future. 57 58 */ 58 59 #define BTRFS_MAX_ZONE_SIZE SZ_8G 60 + #define BTRFS_MIN_ZONE_SIZE SZ_4M 59 61 60 62 #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) 61 63 ··· 401 399 "zoned: %s: zone size %llu larger than supported maximum %llu", 402 400 rcu_str_deref(device->name), 403 401 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); 402 + ret = -EINVAL; 403 + goto out; 404 + } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { 405 + btrfs_err_in_rcu(fs_info, 406 + "zoned: %s: zone size %llu smaller than supported minimum %u", 407 + rcu_str_deref(device->name), 408 + zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); 404 409 ret = -EINVAL; 405 410 goto out; 406 411 } ··· 1844 1835 } 1845 1836 1846 1837 /* No space left */ 1847 - if (block_group->alloc_offset == block_group->zone_capacity) { 1838 + if (btrfs_zoned_bg_is_full(block_group)) { 1848 1839 ret = false; 1849 1840 goto out_unlock; 1850 1841 } ··· 1881 1872 return ret; 1882 1873 } 1883 1874 1884 - int btrfs_zone_finish(struct btrfs_block_group *block_group) 1875 + static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) 1885 1876 { 1886 1877 struct btrfs_fs_info *fs_info = block_group->fs_info; 1887 1878 struct map_lookup *map; 1888 - struct btrfs_device *device; 1889 - u64 physical; 1879 + bool need_zone_finish; 1890 1880 int ret = 0; 1891 1881 int i; 1892 - 1893 - if 
(!btrfs_is_zoned(fs_info)) 1894 - return 0; 1895 - 1896 - map = block_group->physical_map; 1897 1882 1898 1883 spin_lock(&block_group->lock); 1899 1884 if (!block_group->zone_is_active) { ··· 1898 1895 /* Check if we have unwritten allocated space */ 1899 1896 if ((block_group->flags & 1900 1897 (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && 1901 - block_group->alloc_offset > block_group->meta_write_pointer) { 1898 + block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { 1902 1899 spin_unlock(&block_group->lock); 1903 1900 return -EAGAIN; 1904 1901 } 1905 - spin_unlock(&block_group->lock); 1906 - 1907 - ret = btrfs_inc_block_group_ro(block_group, false); 1908 - if (ret) 1909 - return ret; 1910 - 1911 - /* Ensure all writes in this block group finish */ 1912 - btrfs_wait_block_group_reservations(block_group); 1913 - /* No need to wait for NOCOW writers. Zoned mode does not allow that. */ 1914 - btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, 1915 - block_group->length); 1916 - 1917 - spin_lock(&block_group->lock); 1918 1902 1919 1903 /* 1920 - * Bail out if someone already deactivated the block group, or 1921 - * allocated space is left in the block group. 1904 + * If we are sure that the block group is full (= no more room left for 1905 + * new allocation) and the IO for the last usable block is completed, we 1906 + * don't need to wait for the other IOs. This holds because we ensure 1907 + * the sequential IO submissions using the ZONE_APPEND command for data 1908 + * and block_group->meta_write_pointer for metadata. 
1922 1909 */ 1923 - if (!block_group->zone_is_active) { 1910 + if (!fully_written) { 1924 1911 spin_unlock(&block_group->lock); 1925 - btrfs_dec_block_group_ro(block_group); 1926 - return 0; 1912 + 1913 + ret = btrfs_inc_block_group_ro(block_group, false); 1914 + if (ret) 1915 + return ret; 1916 + 1917 + /* Ensure all writes in this block group finish */ 1918 + btrfs_wait_block_group_reservations(block_group); 1919 + /* No need to wait for NOCOW writers. Zoned mode does not allow that */ 1920 + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, 1921 + block_group->length); 1922 + 1923 + spin_lock(&block_group->lock); 1924 + 1925 + /* 1926 + * Bail out if someone already deactivated the block group, or 1927 + * allocated space is left in the block group. 1928 + */ 1929 + if (!block_group->zone_is_active) { 1930 + spin_unlock(&block_group->lock); 1931 + btrfs_dec_block_group_ro(block_group); 1932 + return 0; 1933 + } 1934 + 1935 + if (block_group->reserved) { 1936 + spin_unlock(&block_group->lock); 1937 + btrfs_dec_block_group_ro(block_group); 1938 + return -EAGAIN; 1939 + } 1927 1940 } 1928 1941 1929 - if (block_group->reserved) { 1930 - spin_unlock(&block_group->lock); 1931 - btrfs_dec_block_group_ro(block_group); 1932 - return -EAGAIN; 1933 - } 1942 + /* 1943 + * The block group is not fully allocated, so not fully written yet. We 1944 + * need to send ZONE_FINISH command to free up an active zone. 
1945 + */ 1946 + need_zone_finish = !btrfs_zoned_bg_is_full(block_group); 1934 1947 1935 1948 block_group->zone_is_active = 0; 1936 1949 block_group->alloc_offset = block_group->zone_capacity; ··· 1955 1936 btrfs_clear_data_reloc_bg(block_group); 1956 1937 spin_unlock(&block_group->lock); 1957 1938 1939 + map = block_group->physical_map; 1958 1940 for (i = 0; i < map->num_stripes; i++) { 1959 - device = map->stripes[i].dev; 1960 - physical = map->stripes[i].physical; 1941 + struct btrfs_device *device = map->stripes[i].dev; 1942 + const u64 physical = map->stripes[i].physical; 1961 1943 1962 1944 if (device->zone_info->max_active_zones == 0) 1963 1945 continue; 1964 1946 1965 - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, 1966 - physical >> SECTOR_SHIFT, 1967 - device->zone_info->zone_size >> SECTOR_SHIFT, 1968 - GFP_NOFS); 1947 + if (need_zone_finish) { 1948 + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, 1949 + physical >> SECTOR_SHIFT, 1950 + device->zone_info->zone_size >> SECTOR_SHIFT, 1951 + GFP_NOFS); 1969 1952 1970 - if (ret) 1971 - return ret; 1953 + if (ret) 1954 + return ret; 1955 + } 1972 1956 1973 1957 btrfs_dev_clear_active_zone(device, physical); 1974 1958 } 1975 - btrfs_dec_block_group_ro(block_group); 1959 + 1960 + if (!fully_written) 1961 + btrfs_dec_block_group_ro(block_group); 1976 1962 1977 1963 spin_lock(&fs_info->zone_active_bgs_lock); 1978 1964 ASSERT(!list_empty(&block_group->active_bg_list)); ··· 1988 1964 btrfs_put_block_group(block_group); 1989 1965 1990 1966 return 0; 1967 + } 1968 + 1969 + int btrfs_zone_finish(struct btrfs_block_group *block_group) 1970 + { 1971 + if (!btrfs_is_zoned(block_group->fs_info)) 1972 + return 0; 1973 + 1974 + return do_zone_finish(block_group, false); 1991 1975 } 1992 1976 1993 1977 bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) ··· 2029 1997 void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) 2030 1998 { 2031 1999 struct 
btrfs_block_group *block_group; 2032 - struct map_lookup *map; 2033 - struct btrfs_device *device; 2034 - u64 physical; 2000 + u64 min_alloc_bytes; 2035 2001 2036 2002 if (!btrfs_is_zoned(fs_info)) 2037 2003 return; ··· 2037 2007 block_group = btrfs_lookup_block_group(fs_info, logical); 2038 2008 ASSERT(block_group); 2039 2009 2040 - if (logical + length < block_group->start + block_group->zone_capacity) 2010 + /* No MIXED_BG on zoned btrfs. */ 2011 + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) 2012 + min_alloc_bytes = fs_info->sectorsize; 2013 + else 2014 + min_alloc_bytes = fs_info->nodesize; 2015 + 2016 + /* Bail out if we can allocate more data from this block group. */ 2017 + if (logical + length + min_alloc_bytes <= 2018 + block_group->start + block_group->zone_capacity) 2041 2019 goto out; 2042 2020 2043 - spin_lock(&block_group->lock); 2044 - 2045 - if (!block_group->zone_is_active) { 2046 - spin_unlock(&block_group->lock); 2047 - goto out; 2048 - } 2049 - 2050 - block_group->zone_is_active = 0; 2051 - /* We should have consumed all the free space */ 2052 - ASSERT(block_group->alloc_offset == block_group->zone_capacity); 2053 - ASSERT(block_group->free_space_ctl->free_space == 0); 2054 - btrfs_clear_treelog_bg(block_group); 2055 - btrfs_clear_data_reloc_bg(block_group); 2056 - spin_unlock(&block_group->lock); 2057 - 2058 - map = block_group->physical_map; 2059 - device = map->stripes[0].dev; 2060 - physical = map->stripes[0].physical; 2061 - 2062 - if (!device->zone_info->max_active_zones) 2063 - goto out; 2064 - 2065 - btrfs_dev_clear_active_zone(device, physical); 2066 - 2067 - spin_lock(&fs_info->zone_active_bgs_lock); 2068 - ASSERT(!list_empty(&block_group->active_bg_list)); 2069 - list_del_init(&block_group->active_bg_list); 2070 - spin_unlock(&fs_info->zone_active_bgs_lock); 2071 - 2072 - btrfs_put_block_group(block_group); 2021 + do_zone_finish(block_group, true); 2073 2022 2074 2023 out: 2075 2024 btrfs_put_block_group(block_group); 2025 + } 
2026 + 2027 + static void btrfs_zone_finish_endio_workfn(struct work_struct *work) 2028 + { 2029 + struct btrfs_block_group *bg = 2030 + container_of(work, struct btrfs_block_group, zone_finish_work); 2031 + 2032 + wait_on_extent_buffer_writeback(bg->last_eb); 2033 + free_extent_buffer(bg->last_eb); 2034 + btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); 2035 + btrfs_put_block_group(bg); 2036 + } 2037 + 2038 + void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, 2039 + struct extent_buffer *eb) 2040 + { 2041 + if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) 2042 + return; 2043 + 2044 + if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { 2045 + btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing", 2046 + bg->start); 2047 + return; 2048 + } 2049 + 2050 + /* For the work */ 2051 + btrfs_get_block_group(bg); 2052 + atomic_inc(&eb->refs); 2053 + bg->last_eb = eb; 2054 + INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); 2055 + queue_work(system_unbound_wq, &bg->zone_finish_work); 2076 2056 } 2077 2057 2078 2058 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) ··· 2111 2071 } 2112 2072 } 2113 2073 mutex_unlock(&fs_devices->device_list_mutex); 2074 + } 2075 + 2076 + bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) 2077 + { 2078 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2079 + struct btrfs_device *device; 2080 + u64 used = 0; 2081 + u64 total = 0; 2082 + u64 factor; 2083 + 2084 + ASSERT(btrfs_is_zoned(fs_info)); 2085 + 2086 + if (fs_info->bg_reclaim_threshold == 0) 2087 + return false; 2088 + 2089 + mutex_lock(&fs_devices->device_list_mutex); 2090 + list_for_each_entry(device, &fs_devices->devices, dev_list) { 2091 + if (!device->bdev) 2092 + continue; 2093 + 2094 + total += device->disk_total_bytes; 2095 + used += device->bytes_used; 2096 + } 2097 + mutex_unlock(&fs_devices->device_list_mutex); 2098 + 2099 + factor = 
div64_u64(used * 100, total); 2100 + return factor >= fs_info->bg_reclaim_threshold; 2114 2101 }

+18 -5

fs/btrfs/zoned.h

··· 10 10 #include "block-group.h" 11 11 #include "btrfs_inode.h" 12 12 13 - /* 14 - * Block groups with more than this value (percents) of unusable space will be 15 - * scheduled for background reclaim. 16 - */ 17 - #define BTRFS_DEFAULT_RECLAIM_THRESH 75 13 + #define BTRFS_DEFAULT_RECLAIM_THRESH (75) 18 14 19 15 struct btrfs_zoned_device_info { 20 16 /* ··· 72 76 bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); 73 77 void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, 74 78 u64 length); 79 + void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, 80 + struct extent_buffer *eb); 75 81 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); 76 82 void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); 83 + bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); 77 84 #else /* CONFIG_BLK_DEV_ZONED */ 78 85 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, 79 86 struct blk_zone *zone) ··· 232 233 static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, 233 234 u64 logical, u64 length) { } 234 235 236 + static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, 237 + struct extent_buffer *eb) { } 238 + 235 239 static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } 236 240 237 241 static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } 242 + 243 + static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) 244 + { 245 + return false; 246 + } 238 247 #endif 239 248 240 249 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) ··· 375 368 376 369 if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info)) 377 370 mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock); 371 + } 372 + 373 + static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg) 374 + { 375 + ASSERT(btrfs_is_zoned(bg->fs_info)); 376 + return 
(bg->alloc_offset == bg->zone_capacity); 378 377 } 379 378 380 379 #endif

+9 -5

fs/btrfs/zstd.c

··· 93 93 94 94 void zstd_free_workspace(struct list_head *ws); 95 95 struct list_head *zstd_alloc_workspace(unsigned int level); 96 - /* 97 - * zstd_reclaim_timer_fn - reclaim timer 96 + 97 + /** 98 + * Timer callback to free unused workspaces. 99 + * 98 100 * @t: timer 99 101 * 100 102 * This scans the lru_list and attempts to reclaim any workspace that hasn't 101 103 * been used for ZSTD_BTRFS_RECLAIM_JIFFIES. 104 + * 105 + * The context is softirq and does not need the _bh locking primitives. 102 106 */ 103 107 static void zstd_reclaim_timer_fn(struct timer_list *timer) 104 108 { 105 109 unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; 106 110 struct list_head *pos, *next; 107 111 108 - spin_lock_bh(&wsm.lock); 112 + spin_lock(&wsm.lock); 109 113 110 114 if (list_empty(&wsm.lru_list)) { 111 - spin_unlock_bh(&wsm.lock); 115 + spin_unlock(&wsm.lock); 112 116 return; 113 117 } 114 118 ··· 141 137 if (!list_empty(&wsm.lru_list)) 142 138 mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); 143 139 144 - spin_unlock_bh(&wsm.lock); 140 + spin_unlock(&wsm.lock); 145 141 } 146 142 147 143 /*

+1 -1

fs/erofs/data.c

··· 399 399 400 400 if (!err) 401 401 return iomap_dio_rw(iocb, to, &erofs_iomap_ops, 402 - NULL, 0, 0); 402 + NULL, 0, NULL, 0); 403 403 if (err < 0) 404 404 return err; 405 405 }

+2 -2

fs/ext4/file.c

··· 76 76 return generic_file_read_iter(iocb, to); 77 77 } 78 78 79 - ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0); 79 + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0); 80 80 inode_unlock_shared(inode); 81 81 82 82 file_accessed(iocb->ki_filp); ··· 565 565 iomap_ops = &ext4_iomap_overwrite_ops; 566 566 ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, 567 567 (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0, 568 - 0); 568 + NULL, 0); 569 569 if (ret == -ENOTBLK) 570 570 ret = 0; 571 571

+2 -2

fs/f2fs/file.c

··· 4308 4308 */ 4309 4309 inc_page_count(sbi, F2FS_DIO_READ); 4310 4310 dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops, 4311 - &f2fs_iomap_dio_read_ops, 0, 0); 4311 + &f2fs_iomap_dio_read_ops, 0, NULL, 0); 4312 4312 if (IS_ERR_OR_NULL(dio)) { 4313 4313 ret = PTR_ERR_OR_ZERO(dio); 4314 4314 if (ret != -EIOCBQUEUED) ··· 4526 4526 if (pos + count > inode->i_size) 4527 4527 dio_flags |= IOMAP_DIO_FORCE_WAIT; 4528 4528 dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops, 4529 - &f2fs_iomap_dio_write_ops, dio_flags, 0); 4529 + &f2fs_iomap_dio_write_ops, dio_flags, NULL, 0); 4530 4530 if (IS_ERR_OR_NULL(dio)) { 4531 4531 ret = PTR_ERR_OR_ZERO(dio); 4532 4532 if (ret == -ENOTBLK)

+2 -2

fs/gfs2/file.c

··· 835 835 pagefault_disable(); 836 836 to->nofault = true; 837 837 ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 838 - IOMAP_DIO_PARTIAL, read); 838 + IOMAP_DIO_PARTIAL, NULL, read); 839 839 to->nofault = false; 840 840 pagefault_enable(); 841 841 if (ret <= 0 && ret != -EFAULT) ··· 898 898 899 899 from->nofault = true; 900 900 ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 901 - IOMAP_DIO_PARTIAL, written); 901 + IOMAP_DIO_PARTIAL, NULL, written); 902 902 from->nofault = false; 903 903 if (ret <= 0) { 904 904 if (ret == -ENOTBLK)

+18 -7

fs/iomap/direct-io.c

··· 51 51 }; 52 52 }; 53 53 54 + static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter, 55 + struct iomap_dio *dio, unsigned short nr_vecs, unsigned int opf) 56 + { 57 + if (dio->dops && dio->dops->bio_set) 58 + return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf, 59 + GFP_KERNEL, dio->dops->bio_set); 60 + return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL); 61 + } 62 + 54 63 static void iomap_dio_submit_bio(const struct iomap_iter *iter, 55 64 struct iomap_dio *dio, struct bio *bio, loff_t pos) 56 65 { ··· 154 145 cmpxchg(&dio->error, 0, ret); 155 146 } 156 147 157 - static void iomap_dio_bio_end_io(struct bio *bio) 148 + void iomap_dio_bio_end_io(struct bio *bio) 158 149 { 159 150 struct iomap_dio *dio = bio->bi_private; 160 151 bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); ··· 186 177 bio_put(bio); 187 178 } 188 179 } 180 + EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); 189 181 190 182 static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, 191 183 loff_t pos, unsigned len) 192 184 { 193 185 struct inode *inode = file_inode(dio->iocb->ki_filp); 194 186 struct page *page = ZERO_PAGE(0); 195 - int flags = REQ_SYNC | REQ_IDLE; 196 187 struct bio *bio; 197 188 198 - bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL); 189 + bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); 199 190 fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, 200 191 GFP_KERNEL); 201 192 bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); ··· 320 311 goto out; 321 312 } 322 313 323 - bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL); 314 + bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf); 324 315 fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, 325 316 GFP_KERNEL); 326 317 bio->bi_iter.bi_sector = iomap_sector(iomap, pos); ··· 483 474 struct iomap_dio * 484 475 __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, 485 476 const struct iomap_ops 
*ops, const struct iomap_dio_ops *dops, 486 - unsigned int dio_flags, size_t done_before) 477 + unsigned int dio_flags, void *private, size_t done_before) 487 478 { 488 479 struct address_space *mapping = iocb->ki_filp->f_mapping; 489 480 struct inode *inode = file_inode(iocb->ki_filp); ··· 492 483 .pos = iocb->ki_pos, 493 484 .len = iov_iter_count(iter), 494 485 .flags = IOMAP_DIRECT, 486 + .private = private, 495 487 }; 496 488 loff_t end = iomi.pos + iomi.len - 1, ret = 0; 497 489 bool wait_for_completion = ··· 682 672 ssize_t 683 673 iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, 684 674 const struct iomap_ops *ops, const struct iomap_dio_ops *dops, 685 - unsigned int dio_flags, size_t done_before) 675 + unsigned int dio_flags, void *private, size_t done_before) 686 676 { 687 677 struct iomap_dio *dio; 688 678 689 - dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before); 679 + dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private, 680 + done_before); 690 681 if (IS_ERR_OR_NULL(dio)) 691 682 return PTR_ERR_OR_ZERO(dio); 692 683 return iomap_dio_complete(dio);

+3 -3

fs/xfs/xfs_file.c

··· 225 225 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); 226 226 if (ret) 227 227 return ret; 228 - ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0); 228 + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0); 229 229 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 230 230 231 231 return ret; ··· 534 534 } 535 535 trace_xfs_file_direct_write(iocb, from); 536 536 ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, 537 - &xfs_dio_write_ops, 0, 0); 537 + &xfs_dio_write_ops, 0, NULL, 0); 538 538 out_unlock: 539 539 if (iolock) 540 540 xfs_iunlock(ip, iolock); ··· 612 612 613 613 trace_xfs_file_direct_write(iocb, from); 614 614 ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, 615 - &xfs_dio_write_ops, flags, 0); 615 + &xfs_dio_write_ops, flags, NULL, 0); 616 616 617 617 /* 618 618 * Retry unaligned I/O with exclusive blocking semantics if the DIO

+2 -2

fs/zonefs/super.c

··· 900 900 ret = zonefs_file_dio_append(iocb, from); 901 901 else 902 902 ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, 903 - &zonefs_write_dio_ops, 0, 0); 903 + &zonefs_write_dio_ops, 0, NULL, 0); 904 904 if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && 905 905 (ret > 0 || ret == -EIOCBQUEUED)) { 906 906 if (ret > 0) ··· 1042 1042 } 1043 1043 file_accessed(iocb->ki_filp); 1044 1044 ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, 1045 - &zonefs_read_dio_ops, 0, 0); 1045 + &zonefs_read_dio_ops, 0, NULL, 0); 1046 1046 } else { 1047 1047 ret = generic_file_read_iter(iocb, to); 1048 1048 if (ret == -EIO)

+5

include/linux/fs.h

··· 1708 1708 #define __sb_writers_release(sb, lev) \ 1709 1709 percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) 1710 1710 1711 + static inline bool sb_write_started(const struct super_block *sb) 1712 + { 1713 + return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1); 1714 + } 1715 + 1711 1716 /** 1712 1717 * sb_end_write - drop write access to a superblock 1713 1718 * @sb: the super we wrote to

+14 -2

include/linux/iomap.h

··· 188 188 unsigned flags; 189 189 struct iomap iomap; 190 190 struct iomap srcmap; 191 + void *private; 191 192 }; 192 193 193 194 int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); ··· 321 320 unsigned flags); 322 321 void (*submit_io)(const struct iomap_iter *iter, struct bio *bio, 323 322 loff_t file_offset); 323 + 324 + /* 325 + * Filesystems wishing to attach private information to a direct io bio 326 + * must provide a ->submit_io method that attaches the additional 327 + * information to the bio and changes the ->bi_end_io callback to a 328 + * custom function. This function should, at a minimum, perform any 329 + * relevant post-processing of the bio and end with a call to 330 + * iomap_dio_bio_end_io. 331 + */ 332 + struct bio_set *bio_set; 324 333 }; 325 334 326 335 /* ··· 355 344 356 345 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, 357 346 const struct iomap_ops *ops, const struct iomap_dio_ops *dops, 358 - unsigned int dio_flags, size_t done_before); 347 + unsigned int dio_flags, void *private, size_t done_before); 359 348 struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, 360 349 const struct iomap_ops *ops, const struct iomap_dio_ops *dops, 361 - unsigned int dio_flags, size_t done_before); 350 + unsigned int dio_flags, void *private, size_t done_before); 362 351 ssize_t iomap_dio_complete(struct iomap_dio *dio); 352 + void iomap_dio_bio_end_io(struct bio *bio); 363 353 364 354 #ifdef CONFIG_SWAP 365 355 struct file;

+12 -18

include/trace/events/btrfs.h

··· 24 24 struct map_lookup; 25 25 struct extent_buffer; 26 26 struct btrfs_work; 27 - struct __btrfs_workqueue; 27 + struct btrfs_workqueue; 28 28 struct btrfs_qgroup_extent_record; 29 29 struct btrfs_qgroup; 30 30 struct extent_io_tree; ··· 1457 1457 TP_ARGS(work) 1458 1458 ); 1459 1459 1460 - DECLARE_EVENT_CLASS(btrfs__workqueue, 1460 + DECLARE_EVENT_CLASS(btrfs_workqueue, 1461 1461 1462 - TP_PROTO(const struct __btrfs_workqueue *wq, 1463 - const char *name, int high), 1462 + TP_PROTO(const struct btrfs_workqueue *wq, const char *name), 1464 1463 1465 - TP_ARGS(wq, name, high), 1464 + TP_ARGS(wq, name), 1466 1465 1467 1466 TP_STRUCT__entry_btrfs( 1468 1467 __field( const void *, wq ) 1469 1468 __string( name, name ) 1470 - __field( int , high ) 1471 1469 ), 1472 1470 1473 1471 TP_fast_assign_btrfs(btrfs_workqueue_owner(wq), 1474 1472 __entry->wq = wq; 1475 1473 __assign_str(name, name); 1476 - __entry->high = high; 1477 1474 ), 1478 1475 1479 - TP_printk_btrfs("name=%s%s wq=%p", __get_str(name), 1480 - __print_flags(__entry->high, "", 1481 - {(WQ_HIGHPRI), "-high"}), 1476 + TP_printk_btrfs("name=%s wq=%p", __get_str(name), 1482 1477 __entry->wq) 1483 1478 ); 1484 1479 1485 - DEFINE_EVENT(btrfs__workqueue, btrfs_workqueue_alloc, 1480 + DEFINE_EVENT(btrfs_workqueue, btrfs_workqueue_alloc, 1486 1481 1487 - TP_PROTO(const struct __btrfs_workqueue *wq, 1488 - const char *name, int high), 1482 + TP_PROTO(const struct btrfs_workqueue *wq, const char *name), 1489 1483 1490 - TP_ARGS(wq, name, high) 1484 + TP_ARGS(wq, name) 1491 1485 ); 1492 1486 1493 - DECLARE_EVENT_CLASS(btrfs__workqueue_done, 1487 + DECLARE_EVENT_CLASS(btrfs_workqueue_done, 1494 1488 1495 - TP_PROTO(const struct __btrfs_workqueue *wq), 1489 + TP_PROTO(const struct btrfs_workqueue *wq), 1496 1490 1497 1491 TP_ARGS(wq), 1498 1492 ··· 1501 1507 TP_printk_btrfs("wq=%p", __entry->wq) 1502 1508 ); 1503 1509 1504 - DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy, 1510 + 
DEFINE_EVENT(btrfs_workqueue_done, btrfs_workqueue_destroy, 1505 1511 1506 - TP_PROTO(const struct __btrfs_workqueue *wq), 1512 + TP_PROTO(const struct btrfs_workqueue *wq), 1507 1513 1508 1514 TP_ARGS(wq) 1509 1515 );

-13

include/uapi/linux/btrfs_tree.h

··· 880 880 #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ 881 881 BTRFS_SPACE_INFO_GLOBAL_RSV) 882 882 883 - enum btrfs_raid_types { 884 - BTRFS_RAID_RAID10, 885 - BTRFS_RAID_RAID1, 886 - BTRFS_RAID_DUP, 887 - BTRFS_RAID_RAID0, 888 - BTRFS_RAID_SINGLE, 889 - BTRFS_RAID_RAID5, 890 - BTRFS_RAID_RAID6, 891 - BTRFS_RAID_RAID1C3, 892 - BTRFS_RAID_RAID1C4, 893 - BTRFS_NR_RAID_TYPES 894 - }; 895 - 896 883 #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ 897 884 BTRFS_BLOCK_GROUP_SYSTEM | \ 898 885 BTRFS_BLOCK_GROUP_METADATA)

Configure Feed

Configure Feed