Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs updates from Chris Mason:
"This includes a fairly large change from Josef around data writeback
completion. Before, the writeback wasn't completed until the metadata
insertions for the extent were done, and this made for fairly large
latency spikes on the last page of each ordered extent.

We already had a separate mechanism for tracking pending metadata
insertions, so Josef just needed to tweak things a little to end
writeback earlier on the page. Overall it makes us much friendlier to
memory reclaim and lowers latencies quite a lot for synchronous IO.

Jan Schmidt has finished some background work required to track btree
blocks as they go through changes in ownership. It's the missing
piece he needed for both btrfs send/receive and subvolume quotas.
Neither of those is ready yet, but the new tracking code is included
here. Most of the time, the new code is off. It is only used by
scrub and other backref walkers.

Stefan Behrens has added io failure tracking. This includes counters
for which drives are causing the most trouble so the admin (or an
automated tool) can choose to kick them out. We're tracking IO
errors, crc errors, and generation checks we do on each metadata
block.

RAID5/6 did miss the cut this time because I'm having trouble with
corruptions. I'll nail it down next week and post it for beta testing
before 3.6"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (58 commits)
Btrfs: fix tree mod log rewinded level and rewinding of moved keys
Btrfs: fix tree mod log del_ptr
Btrfs: add tree_mod_dont_log helper
Btrfs: add missing spin_lock for insertion into tree mod log
Btrfs: add inodes before dropping the extent lock in find_all_leafs
Btrfs: use delayed ref sequence numbers for all fs-tree updates
Btrfs: fix false positive in check-integrity on unmount
Btrfs: fix runtime warning in check-integrity check data mode
Btrfs: set ioprio of scrub readahead to idle
Btrfs: fix return code in drop_objectid_items
Btrfs: check to see if the inode is in the log before fsyncing
Btrfs: return value of btrfs_read_buffer is checked correctly
Btrfs: read device stats on mount, write modified ones during commit
Btrfs: add ioctl to get and reset the device stats
Btrfs: add device counters for detected IO and checksum errors
btrfs: Drop unused function btrfs_abort_devices()
Btrfs: fix the same inode id problem when doing auto defragment
Btrfs: fall back to non-inline if we don't have enough space
Btrfs: fix how we deal with the orphan block rsv
Btrfs: convert the inode bit field to use the actual bit operations
...

+2850 -867
+4
fs/btrfs/acl.c
··· 227 227 if (ret > 0) { 228 228 /* we need an acl */ 229 229 ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); 230 + } else { 231 + cache_no_acl(inode); 230 232 } 233 + } else { 234 + cache_no_acl(inode); 231 235 } 232 236 failed: 233 237 posix_acl_release(acl);
+350 -145
fs/btrfs/backref.c
··· 24 24 #include "delayed-ref.h" 25 25 #include "locking.h" 26 26 27 + struct extent_inode_elem { 28 + u64 inum; 29 + u64 offset; 30 + struct extent_inode_elem *next; 31 + }; 32 + 33 + static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, 34 + struct btrfs_file_extent_item *fi, 35 + u64 extent_item_pos, 36 + struct extent_inode_elem **eie) 37 + { 38 + u64 data_offset; 39 + u64 data_len; 40 + struct extent_inode_elem *e; 41 + 42 + data_offset = btrfs_file_extent_offset(eb, fi); 43 + data_len = btrfs_file_extent_num_bytes(eb, fi); 44 + 45 + if (extent_item_pos < data_offset || 46 + extent_item_pos >= data_offset + data_len) 47 + return 1; 48 + 49 + e = kmalloc(sizeof(*e), GFP_NOFS); 50 + if (!e) 51 + return -ENOMEM; 52 + 53 + e->next = *eie; 54 + e->inum = key->objectid; 55 + e->offset = key->offset + (extent_item_pos - data_offset); 56 + *eie = e; 57 + 58 + return 0; 59 + } 60 + 61 + static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, 62 + u64 extent_item_pos, 63 + struct extent_inode_elem **eie) 64 + { 65 + u64 disk_byte; 66 + struct btrfs_key key; 67 + struct btrfs_file_extent_item *fi; 68 + int slot; 69 + int nritems; 70 + int extent_type; 71 + int ret; 72 + 73 + /* 74 + * from the shared data ref, we only have the leaf but we need 75 + * the key. thus, we must look into all items and see that we 76 + * find one (some) with a reference to our extent item. 
77 + */ 78 + nritems = btrfs_header_nritems(eb); 79 + for (slot = 0; slot < nritems; ++slot) { 80 + btrfs_item_key_to_cpu(eb, &key, slot); 81 + if (key.type != BTRFS_EXTENT_DATA_KEY) 82 + continue; 83 + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 84 + extent_type = btrfs_file_extent_type(eb, fi); 85 + if (extent_type == BTRFS_FILE_EXTENT_INLINE) 86 + continue; 87 + /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ 88 + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); 89 + if (disk_byte != wanted_disk_byte) 90 + continue; 91 + 92 + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); 93 + if (ret < 0) 94 + return ret; 95 + } 96 + 97 + return 0; 98 + } 99 + 27 100 /* 28 101 * this structure records all encountered refs on the way up to the root 29 102 */ 30 103 struct __prelim_ref { 31 104 struct list_head list; 32 105 u64 root_id; 33 - struct btrfs_key key; 106 + struct btrfs_key key_for_search; 34 107 int level; 35 108 int count; 109 + struct extent_inode_elem *inode_list; 36 110 u64 parent; 37 111 u64 wanted_disk_byte; 38 112 }; 39 113 114 + /* 115 + * the rules for all callers of this function are: 116 + * - obtaining the parent is the goal 117 + * - if you add a key, you must know that it is a correct key 118 + * - if you cannot add the parent or a correct key, then we will look into the 119 + * block later to set a correct key 120 + * 121 + * delayed refs 122 + * ============ 123 + * backref type | shared | indirect | shared | indirect 124 + * information | tree | tree | data | data 125 + * --------------------+--------+----------+--------+---------- 126 + * parent logical | y | - | - | - 127 + * key to resolve | - | y | y | y 128 + * tree block logical | - | - | - | - 129 + * root for resolving | y | y | y | y 130 + * 131 + * - column 1: we've the parent -> done 132 + * - column 2, 3, 4: we use the key to find the parent 133 + * 134 + * on disk refs (inline or keyed) 135 + * ============================== 136 + * 
backref type | shared | indirect | shared | indirect 137 + * information | tree | tree | data | data 138 + * --------------------+--------+----------+--------+---------- 139 + * parent logical | y | - | y | - 140 + * key to resolve | - | - | - | y 141 + * tree block logical | y | y | y | y 142 + * root for resolving | - | y | y | y 143 + * 144 + * - column 1, 3: we've the parent -> done 145 + * - column 2: we take the first key from the block to find the parent 146 + * (see __add_missing_keys) 147 + * - column 4: we use the key to find the parent 148 + * 149 + * additional information that's available but not required to find the parent 150 + * block might help in merging entries to gain some speed. 151 + */ 152 + 40 153 static int __add_prelim_ref(struct list_head *head, u64 root_id, 41 - struct btrfs_key *key, int level, u64 parent, 42 - u64 wanted_disk_byte, int count) 154 + struct btrfs_key *key, int level, 155 + u64 parent, u64 wanted_disk_byte, int count) 43 156 { 44 157 struct __prelim_ref *ref; 45 158 ··· 163 50 164 51 ref->root_id = root_id; 165 52 if (key) 166 - ref->key = *key; 53 + ref->key_for_search = *key; 167 54 else 168 - memset(&ref->key, 0, sizeof(ref->key)); 55 + memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); 169 56 57 + ref->inode_list = NULL; 170 58 ref->level = level; 171 59 ref->count = count; 172 60 ref->parent = parent; ··· 178 64 } 179 65 180 66 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, 181 - struct ulist *parents, 182 - struct extent_buffer *eb, int level, 183 - u64 wanted_objectid, u64 wanted_disk_byte) 67 + struct ulist *parents, int level, 68 + struct btrfs_key *key, u64 wanted_disk_byte, 69 + const u64 *extent_item_pos) 184 70 { 185 71 int ret; 186 - int slot; 72 + int slot = path->slots[level]; 73 + struct extent_buffer *eb = path->nodes[level]; 187 74 struct btrfs_file_extent_item *fi; 188 - struct btrfs_key key; 75 + struct extent_inode_elem *eie = NULL; 189 76 u64 disk_byte; 77 + 
u64 wanted_objectid = key->objectid; 190 78 191 79 add_parent: 192 - ret = ulist_add(parents, eb->start, 0, GFP_NOFS); 80 + if (level == 0 && extent_item_pos) { 81 + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 82 + ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie); 83 + if (ret < 0) 84 + return ret; 85 + } 86 + ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS); 193 87 if (ret < 0) 194 88 return ret; 195 89 ··· 211 89 * repeat this until we don't find any additional EXTENT_DATA items. 212 90 */ 213 91 while (1) { 92 + eie = NULL; 214 93 ret = btrfs_next_leaf(root, path); 215 94 if (ret < 0) 216 95 return ret; ··· 220 97 221 98 eb = path->nodes[0]; 222 99 for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { 223 - btrfs_item_key_to_cpu(eb, &key, slot); 224 - if (key.objectid != wanted_objectid || 225 - key.type != BTRFS_EXTENT_DATA_KEY) 100 + btrfs_item_key_to_cpu(eb, key, slot); 101 + if (key->objectid != wanted_objectid || 102 + key->type != BTRFS_EXTENT_DATA_KEY) 226 103 return 0; 227 104 fi = btrfs_item_ptr(eb, slot, 228 105 struct btrfs_file_extent_item); ··· 241 118 */ 242 119 static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, 243 120 int search_commit_root, 121 + u64 time_seq, 244 122 struct __prelim_ref *ref, 245 - struct ulist *parents) 123 + struct ulist *parents, 124 + const u64 *extent_item_pos) 246 125 { 247 126 struct btrfs_path *path; 248 127 struct btrfs_root *root; ··· 277 152 goto out; 278 153 279 154 path->lowest_level = level; 280 - ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); 155 + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); 281 156 pr_debug("search slot in root %llu (level %d, ref count %d) returned " 282 157 "%d for key (%llu %u %llu)\n", 283 158 (unsigned long long)ref->root_id, level, ref->count, ret, 284 - (unsigned long long)ref->key.objectid, ref->key.type, 285 - (unsigned long long)ref->key.offset); 159 + (unsigned long 
long)ref->key_for_search.objectid, 160 + ref->key_for_search.type, 161 + (unsigned long long)ref->key_for_search.offset); 286 162 if (ret < 0) 287 163 goto out; 288 164 ··· 305 179 btrfs_item_key_to_cpu(eb, &key, path->slots[0]); 306 180 } 307 181 308 - /* the last two parameters will only be used for level == 0 */ 309 - ret = add_all_parents(root, path, parents, eb, level, key.objectid, 310 - ref->wanted_disk_byte); 182 + ret = add_all_parents(root, path, parents, level, &key, 183 + ref->wanted_disk_byte, extent_item_pos); 311 184 out: 312 185 btrfs_free_path(path); 313 186 return ret; ··· 316 191 * resolve all indirect backrefs from the list 317 192 */ 318 193 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, 319 - int search_commit_root, 320 - struct list_head *head) 194 + int search_commit_root, u64 time_seq, 195 + struct list_head *head, 196 + const u64 *extent_item_pos) 321 197 { 322 198 int err; 323 199 int ret = 0; ··· 327 201 struct __prelim_ref *new_ref; 328 202 struct ulist *parents; 329 203 struct ulist_node *node; 204 + struct ulist_iterator uiter; 330 205 331 206 parents = ulist_alloc(GFP_NOFS); 332 207 if (!parents) ··· 344 217 if (ref->count == 0) 345 218 continue; 346 219 err = __resolve_indirect_ref(fs_info, search_commit_root, 347 - ref, parents); 220 + time_seq, ref, parents, 221 + extent_item_pos); 348 222 if (err) { 349 223 if (ret == 0) 350 224 ret = err; ··· 353 225 } 354 226 355 227 /* we put the first parent into the ref at hand */ 356 - node = ulist_next(parents, NULL); 228 + ULIST_ITER_INIT(&uiter); 229 + node = ulist_next(parents, &uiter); 357 230 ref->parent = node ? node->val : 0; 231 + ref->inode_list = 232 + node ? 
(struct extent_inode_elem *)node->aux : 0; 358 233 359 234 /* additional parents require new refs being added here */ 360 - while ((node = ulist_next(parents, node))) { 235 + while ((node = ulist_next(parents, &uiter))) { 361 236 new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); 362 237 if (!new_ref) { 363 238 ret = -ENOMEM; ··· 368 237 } 369 238 memcpy(new_ref, ref, sizeof(*ref)); 370 239 new_ref->parent = node->val; 240 + new_ref->inode_list = 241 + (struct extent_inode_elem *)node->aux; 371 242 list_add(&new_ref->list, &ref->list); 372 243 } 373 244 ulist_reinit(parents); ··· 379 246 return ret; 380 247 } 381 248 249 + static inline int ref_for_same_block(struct __prelim_ref *ref1, 250 + struct __prelim_ref *ref2) 251 + { 252 + if (ref1->level != ref2->level) 253 + return 0; 254 + if (ref1->root_id != ref2->root_id) 255 + return 0; 256 + if (ref1->key_for_search.type != ref2->key_for_search.type) 257 + return 0; 258 + if (ref1->key_for_search.objectid != ref2->key_for_search.objectid) 259 + return 0; 260 + if (ref1->key_for_search.offset != ref2->key_for_search.offset) 261 + return 0; 262 + if (ref1->parent != ref2->parent) 263 + return 0; 264 + 265 + return 1; 266 + } 267 + 268 + /* 269 + * read tree blocks and add keys where required. 
270 + */ 271 + static int __add_missing_keys(struct btrfs_fs_info *fs_info, 272 + struct list_head *head) 273 + { 274 + struct list_head *pos; 275 + struct extent_buffer *eb; 276 + 277 + list_for_each(pos, head) { 278 + struct __prelim_ref *ref; 279 + ref = list_entry(pos, struct __prelim_ref, list); 280 + 281 + if (ref->parent) 282 + continue; 283 + if (ref->key_for_search.type) 284 + continue; 285 + BUG_ON(!ref->wanted_disk_byte); 286 + eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, 287 + fs_info->tree_root->leafsize, 0); 288 + BUG_ON(!eb); 289 + btrfs_tree_read_lock(eb); 290 + if (btrfs_header_level(eb) == 0) 291 + btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); 292 + else 293 + btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); 294 + btrfs_tree_read_unlock(eb); 295 + free_extent_buffer(eb); 296 + } 297 + return 0; 298 + } 299 + 382 300 /* 383 301 * merge two lists of backrefs and adjust counts accordingly 384 302 * 385 303 * mode = 1: merge identical keys, if key is set 304 + * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. 305 + * additionally, we could even add a key range for the blocks we 306 + * looked into to merge even more (-> replace unresolved refs by those 307 + * having a parent). 
386 308 * mode = 2: merge identical parents 387 309 */ 388 310 static int __merge_refs(struct list_head *head, int mode) ··· 451 263 452 264 ref1 = list_entry(pos1, struct __prelim_ref, list); 453 265 454 - if (mode == 1 && ref1->key.type == 0) 455 - continue; 456 266 for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; 457 267 pos2 = n2, n2 = pos2->next) { 458 268 struct __prelim_ref *ref2; 269 + struct __prelim_ref *xchg; 459 270 460 271 ref2 = list_entry(pos2, struct __prelim_ref, list); 461 272 462 273 if (mode == 1) { 463 - if (memcmp(&ref1->key, &ref2->key, 464 - sizeof(ref1->key)) || 465 - ref1->level != ref2->level || 466 - ref1->root_id != ref2->root_id) 274 + if (!ref_for_same_block(ref1, ref2)) 467 275 continue; 276 + if (!ref1->parent && ref2->parent) { 277 + xchg = ref1; 278 + ref1 = ref2; 279 + ref2 = xchg; 280 + } 468 281 ref1->count += ref2->count; 469 282 } else { 470 283 if (ref1->parent != ref2->parent) ··· 485 296 * smaller or equal that seq to the list 486 297 */ 487 298 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, 488 - struct btrfs_key *info_key, 489 299 struct list_head *prefs) 490 300 { 491 301 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 492 302 struct rb_node *n = &head->node.rb_node; 303 + struct btrfs_key key; 304 + struct btrfs_key op_key = {0}; 493 305 int sgn; 494 306 int ret = 0; 495 307 496 308 if (extent_op && extent_op->update_key) 497 - btrfs_disk_key_to_cpu(info_key, &extent_op->key); 309 + btrfs_disk_key_to_cpu(&op_key, &extent_op->key); 498 310 499 311 while ((n = rb_prev(n))) { 500 312 struct btrfs_delayed_ref_node *node; ··· 527 337 struct btrfs_delayed_tree_ref *ref; 528 338 529 339 ref = btrfs_delayed_node_to_tree_ref(node); 530 - ret = __add_prelim_ref(prefs, ref->root, info_key, 340 + ret = __add_prelim_ref(prefs, ref->root, &op_key, 531 341 ref->level + 1, 0, node->bytenr, 532 342 node->ref_mod * sgn); 533 343 break; ··· 536 346 struct btrfs_delayed_tree_ref *ref; 537 
347 538 348 ref = btrfs_delayed_node_to_tree_ref(node); 539 - ret = __add_prelim_ref(prefs, ref->root, info_key, 349 + ret = __add_prelim_ref(prefs, ref->root, NULL, 540 350 ref->level + 1, ref->parent, 541 351 node->bytenr, 542 352 node->ref_mod * sgn); ··· 544 354 } 545 355 case BTRFS_EXTENT_DATA_REF_KEY: { 546 356 struct btrfs_delayed_data_ref *ref; 547 - struct btrfs_key key; 548 - 549 357 ref = btrfs_delayed_node_to_data_ref(node); 550 358 551 359 key.objectid = ref->objectid; ··· 556 368 } 557 369 case BTRFS_SHARED_DATA_REF_KEY: { 558 370 struct btrfs_delayed_data_ref *ref; 559 - struct btrfs_key key; 560 371 561 372 ref = btrfs_delayed_node_to_data_ref(node); 562 373 ··· 581 394 */ 582 395 static int __add_inline_refs(struct btrfs_fs_info *fs_info, 583 396 struct btrfs_path *path, u64 bytenr, 584 - struct btrfs_key *info_key, int *info_level, 585 - struct list_head *prefs) 397 + int *info_level, struct list_head *prefs) 586 398 { 587 399 int ret = 0; 588 400 int slot; ··· 597 411 * enumerate all inline refs 598 412 */ 599 413 leaf = path->nodes[0]; 600 - slot = path->slots[0] - 1; 414 + slot = path->slots[0]; 601 415 602 416 item_size = btrfs_item_size_nr(leaf, slot); 603 417 BUG_ON(item_size < sizeof(*ei)); ··· 610 424 611 425 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 612 426 struct btrfs_tree_block_info *info; 613 - struct btrfs_disk_key disk_key; 614 427 615 428 info = (struct btrfs_tree_block_info *)ptr; 616 429 *info_level = btrfs_tree_block_level(leaf, info); 617 - btrfs_tree_block_key(leaf, info, &disk_key); 618 - btrfs_disk_key_to_cpu(info_key, &disk_key); 619 430 ptr += sizeof(struct btrfs_tree_block_info); 620 431 BUG_ON(ptr > end); 621 432 } else { ··· 630 447 631 448 switch (type) { 632 449 case BTRFS_SHARED_BLOCK_REF_KEY: 633 - ret = __add_prelim_ref(prefs, 0, info_key, 450 + ret = __add_prelim_ref(prefs, 0, NULL, 634 451 *info_level + 1, offset, 635 452 bytenr, 1); 636 453 break; ··· 645 462 break; 646 463 } 647 464 case 
BTRFS_TREE_BLOCK_REF_KEY: 648 - ret = __add_prelim_ref(prefs, offset, info_key, 649 - *info_level + 1, 0, bytenr, 1); 465 + ret = __add_prelim_ref(prefs, offset, NULL, 466 + *info_level + 1, 0, 467 + bytenr, 1); 650 468 break; 651 469 case BTRFS_EXTENT_DATA_REF_KEY: { 652 470 struct btrfs_extent_data_ref *dref; ··· 661 477 key.type = BTRFS_EXTENT_DATA_KEY; 662 478 key.offset = btrfs_extent_data_ref_offset(leaf, dref); 663 479 root = btrfs_extent_data_ref_root(leaf, dref); 664 - ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, 665 - count); 480 + ret = __add_prelim_ref(prefs, root, &key, 0, 0, 481 + bytenr, count); 666 482 break; 667 483 } 668 484 default: ··· 680 496 */ 681 497 static int __add_keyed_refs(struct btrfs_fs_info *fs_info, 682 498 struct btrfs_path *path, u64 bytenr, 683 - struct btrfs_key *info_key, int info_level, 684 - struct list_head *prefs) 499 + int info_level, struct list_head *prefs) 685 500 { 686 501 struct btrfs_root *extent_root = fs_info->extent_root; 687 502 int ret; ··· 710 527 711 528 switch (key.type) { 712 529 case BTRFS_SHARED_BLOCK_REF_KEY: 713 - ret = __add_prelim_ref(prefs, 0, info_key, 530 + ret = __add_prelim_ref(prefs, 0, NULL, 714 531 info_level + 1, key.offset, 715 532 bytenr, 1); 716 533 break; ··· 726 543 break; 727 544 } 728 545 case BTRFS_TREE_BLOCK_REF_KEY: 729 - ret = __add_prelim_ref(prefs, key.offset, info_key, 730 - info_level + 1, 0, bytenr, 1); 546 + ret = __add_prelim_ref(prefs, key.offset, NULL, 547 + info_level + 1, 0, 548 + bytenr, 1); 731 549 break; 732 550 case BTRFS_EXTENT_DATA_REF_KEY: { 733 551 struct btrfs_extent_data_ref *dref; ··· 744 560 key.offset = btrfs_extent_data_ref_offset(leaf, dref); 745 561 root = btrfs_extent_data_ref_root(leaf, dref); 746 562 ret = __add_prelim_ref(prefs, root, &key, 0, 0, 747 - bytenr, count); 563 + bytenr, count); 748 564 break; 749 565 } 750 566 default: ··· 766 582 */ 767 583 static int find_parent_nodes(struct btrfs_trans_handle *trans, 768 584 struct 
btrfs_fs_info *fs_info, u64 bytenr, 769 - u64 seq, struct ulist *refs, struct ulist *roots) 585 + u64 delayed_ref_seq, u64 time_seq, 586 + struct ulist *refs, struct ulist *roots, 587 + const u64 *extent_item_pos) 770 588 { 771 589 struct btrfs_key key; 772 590 struct btrfs_path *path; 773 - struct btrfs_key info_key = { 0 }; 774 591 struct btrfs_delayed_ref_root *delayed_refs = NULL; 775 592 struct btrfs_delayed_ref_head *head; 776 593 int info_level = 0; ··· 830 645 btrfs_put_delayed_ref(&head->node); 831 646 goto again; 832 647 } 833 - ret = __add_delayed_refs(head, seq, &info_key, 648 + ret = __add_delayed_refs(head, delayed_ref_seq, 834 649 &prefs_delayed); 835 650 if (ret) { 836 651 spin_unlock(&delayed_refs->lock); ··· 844 659 struct extent_buffer *leaf; 845 660 int slot; 846 661 662 + path->slots[0]--; 847 663 leaf = path->nodes[0]; 848 - slot = path->slots[0] - 1; 664 + slot = path->slots[0]; 849 665 btrfs_item_key_to_cpu(leaf, &key, slot); 850 666 if (key.objectid == bytenr && 851 667 key.type == BTRFS_EXTENT_ITEM_KEY) { 852 668 ret = __add_inline_refs(fs_info, path, bytenr, 853 - &info_key, &info_level, &prefs); 669 + &info_level, &prefs); 854 670 if (ret) 855 671 goto out; 856 - ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, 672 + ret = __add_keyed_refs(fs_info, path, bytenr, 857 673 info_level, &prefs); 858 674 if (ret) 859 675 goto out; ··· 862 676 } 863 677 btrfs_release_path(path); 864 678 865 - /* 866 - * when adding the delayed refs above, the info_key might not have 867 - * been known yet. 
Go over the list and replace the missing keys 868 - */ 869 - list_for_each_entry(ref, &prefs_delayed, list) { 870 - if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) 871 - memcpy(&ref->key, &info_key, sizeof(ref->key)); 872 - } 873 679 list_splice_init(&prefs_delayed, &prefs); 680 + 681 + ret = __add_missing_keys(fs_info, &prefs); 682 + if (ret) 683 + goto out; 874 684 875 685 ret = __merge_refs(&prefs, 1); 876 686 if (ret) 877 687 goto out; 878 688 879 - ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs); 689 + ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, 690 + &prefs, extent_item_pos); 880 691 if (ret) 881 692 goto out; 882 693 ··· 892 709 BUG_ON(ret < 0); 893 710 } 894 711 if (ref->count && ref->parent) { 895 - ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); 712 + struct extent_inode_elem *eie = NULL; 713 + if (extent_item_pos && !ref->inode_list) { 714 + u32 bsz; 715 + struct extent_buffer *eb; 716 + bsz = btrfs_level_size(fs_info->extent_root, 717 + info_level); 718 + eb = read_tree_block(fs_info->extent_root, 719 + ref->parent, bsz, 0); 720 + BUG_ON(!eb); 721 + ret = find_extent_in_eb(eb, bytenr, 722 + *extent_item_pos, &eie); 723 + ref->inode_list = eie; 724 + free_extent_buffer(eb); 725 + } 726 + ret = ulist_add_merge(refs, ref->parent, 727 + (unsigned long)ref->inode_list, 728 + (unsigned long *)&eie, GFP_NOFS); 729 + if (!ret && extent_item_pos) { 730 + /* 731 + * we've recorded that parent, so we must extend 732 + * its inode list here 733 + */ 734 + BUG_ON(!eie); 735 + while (eie->next) 736 + eie = eie->next; 737 + eie->next = ref->inode_list; 738 + } 896 739 BUG_ON(ret < 0); 897 740 } 898 741 kfree(ref); ··· 943 734 return ret; 944 735 } 945 736 737 + static void free_leaf_list(struct ulist *blocks) 738 + { 739 + struct ulist_node *node = NULL; 740 + struct extent_inode_elem *eie; 741 + struct extent_inode_elem *eie_next; 742 + struct ulist_iterator uiter; 743 + 744 + ULIST_ITER_INIT(&uiter); 745 + 
while ((node = ulist_next(blocks, &uiter))) { 746 + if (!node->aux) 747 + continue; 748 + eie = (struct extent_inode_elem *)node->aux; 749 + for (; eie; eie = eie_next) { 750 + eie_next = eie->next; 751 + kfree(eie); 752 + } 753 + node->aux = 0; 754 + } 755 + 756 + ulist_free(blocks); 757 + } 758 + 946 759 /* 947 760 * Finds all leafs with a reference to the specified combination of bytenr and 948 761 * offset. key_list_head will point to a list of corresponding keys (caller must ··· 975 744 */ 976 745 static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, 977 746 struct btrfs_fs_info *fs_info, u64 bytenr, 978 - u64 num_bytes, u64 seq, struct ulist **leafs) 747 + u64 delayed_ref_seq, u64 time_seq, 748 + struct ulist **leafs, 749 + const u64 *extent_item_pos) 979 750 { 980 751 struct ulist *tmp; 981 752 int ret; ··· 991 758 return -ENOMEM; 992 759 } 993 760 994 - ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); 761 + ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, 762 + time_seq, *leafs, tmp, extent_item_pos); 995 763 ulist_free(tmp); 996 764 997 765 if (ret < 0 && ret != -ENOENT) { 998 - ulist_free(*leafs); 766 + free_leaf_list(*leafs); 999 767 return ret; 1000 768 } 1001 769 ··· 1018 784 */ 1019 785 int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 1020 786 struct btrfs_fs_info *fs_info, u64 bytenr, 1021 - u64 num_bytes, u64 seq, struct ulist **roots) 787 + u64 delayed_ref_seq, u64 time_seq, 788 + struct ulist **roots) 1022 789 { 1023 790 struct ulist *tmp; 1024 791 struct ulist_node *node = NULL; 792 + struct ulist_iterator uiter; 1025 793 int ret; 1026 794 1027 795 tmp = ulist_alloc(GFP_NOFS); ··· 1035 799 return -ENOMEM; 1036 800 } 1037 801 802 + ULIST_ITER_INIT(&uiter); 1038 803 while (1) { 1039 - ret = find_parent_nodes(trans, fs_info, bytenr, seq, 1040 - tmp, *roots); 804 + ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq, 805 + time_seq, tmp, *roots, NULL); 1041 806 if (ret < 0 && ret != 
-ENOENT) { 1042 807 ulist_free(tmp); 1043 808 ulist_free(*roots); 1044 809 return ret; 1045 810 } 1046 - node = ulist_next(tmp, node); 811 + node = ulist_next(tmp, &uiter); 1047 812 if (!node) 1048 813 break; 1049 814 bytenr = node->val; ··· 1330 1093 return 0; 1331 1094 } 1332 1095 1333 - static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical, 1334 - u64 orig_extent_item_objectid, 1335 - u64 extent_item_pos, u64 root, 1096 + static int iterate_leaf_refs(struct extent_inode_elem *inode_list, 1097 + u64 root, u64 extent_item_objectid, 1336 1098 iterate_extent_inodes_t *iterate, void *ctx) 1337 1099 { 1338 - u64 disk_byte; 1339 - struct btrfs_key key; 1340 - struct btrfs_file_extent_item *fi; 1341 - struct extent_buffer *eb; 1342 - int slot; 1343 - int nritems; 1100 + struct extent_inode_elem *eie; 1344 1101 int ret = 0; 1345 - int extent_type; 1346 - u64 data_offset; 1347 - u64 data_len; 1348 1102 1349 - eb = read_tree_block(fs_info->tree_root, logical, 1350 - fs_info->tree_root->leafsize, 0); 1351 - if (!eb) 1352 - return -EIO; 1353 - 1354 - /* 1355 - * from the shared data ref, we only have the leaf but we need 1356 - * the key. thus, we must look into all items and see that we 1357 - * find one (some) with a reference to our extent item. 
1358 - */ 1359 - nritems = btrfs_header_nritems(eb); 1360 - for (slot = 0; slot < nritems; ++slot) { 1361 - btrfs_item_key_to_cpu(eb, &key, slot); 1362 - if (key.type != BTRFS_EXTENT_DATA_KEY) 1363 - continue; 1364 - fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 1365 - extent_type = btrfs_file_extent_type(eb, fi); 1366 - if (extent_type == BTRFS_FILE_EXTENT_INLINE) 1367 - continue; 1368 - /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ 1369 - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); 1370 - if (disk_byte != orig_extent_item_objectid) 1371 - continue; 1372 - 1373 - data_offset = btrfs_file_extent_offset(eb, fi); 1374 - data_len = btrfs_file_extent_num_bytes(eb, fi); 1375 - 1376 - if (extent_item_pos < data_offset || 1377 - extent_item_pos >= data_offset + data_len) 1378 - continue; 1379 - 1103 + for (eie = inode_list; eie; eie = eie->next) { 1380 1104 pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " 1381 - "root %llu\n", orig_extent_item_objectid, 1382 - key.objectid, key.offset, root); 1383 - ret = iterate(key.objectid, 1384 - key.offset + (extent_item_pos - data_offset), 1385 - root, ctx); 1105 + "root %llu\n", extent_item_objectid, 1106 + eie->inum, eie->offset, root); 1107 + ret = iterate(eie->inum, eie->offset, root, ctx); 1386 1108 if (ret) { 1387 - pr_debug("stopping iteration because ret=%d\n", ret); 1109 + pr_debug("stopping iteration for %llu due to ret=%d\n", 1110 + extent_item_objectid, ret); 1388 1111 break; 1389 1112 } 1390 1113 } 1391 - 1392 - free_extent_buffer(eb); 1393 1114 1394 1115 return ret; 1395 1116 } ··· 1370 1175 struct ulist *roots = NULL; 1371 1176 struct ulist_node *ref_node = NULL; 1372 1177 struct ulist_node *root_node = NULL; 1373 - struct seq_list seq_elem; 1178 + struct seq_list seq_elem = {}; 1179 + struct seq_list tree_mod_seq_elem = {}; 1180 + struct ulist_iterator ref_uiter; 1181 + struct ulist_iterator root_uiter; 1374 1182 struct btrfs_delayed_ref_root *delayed_refs = 
NULL; 1375 1183 1376 1184 pr_debug("resolving all inodes for extent %llu\n", ··· 1390 1192 spin_lock(&delayed_refs->lock); 1391 1193 btrfs_get_delayed_seq(delayed_refs, &seq_elem); 1392 1194 spin_unlock(&delayed_refs->lock); 1195 + btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); 1393 1196 } 1394 1197 1395 1198 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, 1396 - extent_item_pos, seq_elem.seq, 1397 - &refs); 1398 - 1199 + seq_elem.seq, tree_mod_seq_elem.seq, &refs, 1200 + &extent_item_pos); 1399 1201 if (ret) 1400 1202 goto out; 1401 1203 1402 - while (!ret && (ref_node = ulist_next(refs, ref_node))) { 1403 - ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, 1404 - seq_elem.seq, &roots); 1204 + ULIST_ITER_INIT(&ref_uiter); 1205 + while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { 1206 + ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, 1207 + seq_elem.seq, 1208 + tree_mod_seq_elem.seq, &roots); 1405 1209 if (ret) 1406 1210 break; 1407 - while (!ret && (root_node = ulist_next(roots, root_node))) { 1408 - pr_debug("root %llu references leaf %llu\n", 1409 - root_node->val, ref_node->val); 1410 - ret = iterate_leaf_refs(fs_info, ref_node->val, 1411 - extent_item_objectid, 1412 - extent_item_pos, root_node->val, 1413 - iterate, ctx); 1211 + ULIST_ITER_INIT(&root_uiter); 1212 + while (!ret && (root_node = ulist_next(roots, &root_uiter))) { 1213 + pr_debug("root %llu references leaf %llu, data list " 1214 + "%#lx\n", root_node->val, ref_node->val, 1215 + ref_node->aux); 1216 + ret = iterate_leaf_refs( 1217 + (struct extent_inode_elem *)ref_node->aux, 1218 + root_node->val, extent_item_objectid, 1219 + iterate, ctx); 1414 1220 } 1221 + ulist_free(roots); 1222 + roots = NULL; 1415 1223 } 1416 1224 1417 - ulist_free(refs); 1225 + free_leaf_list(refs); 1418 1226 ulist_free(roots); 1419 1227 out: 1420 1228 if (!search_commit_root) { 1229 + btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); 1421 1230 
btrfs_put_delayed_seq(delayed_refs, &seq_elem); 1422 1231 btrfs_end_transaction(trans, fs_info->extent_root); 1423 1232 }
+2 -1
fs/btrfs/backref.h
··· 58 58 59 59 int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 60 60 struct btrfs_fs_info *fs_info, u64 bytenr, 61 - u64 num_bytes, u64 seq, struct ulist **roots); 61 + u64 delayed_ref_seq, u64 time_seq, 62 + struct ulist **roots); 62 63 63 64 struct btrfs_data_container *init_data_container(u32 total_bytes); 64 65 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+30 -20
fs/btrfs/btrfs_inode.h
··· 24 24 #include "ordered-data.h" 25 25 #include "delayed-inode.h" 26 26 27 + /* 28 + * ordered_data_close is set by truncate when a file that used 29 + * to have good data has been truncated to zero. When it is set 30 + * the btrfs file release call will add this inode to the 31 + * ordered operations list so that we make sure to flush out any 32 + * new data the application may have written before commit. 33 + */ 34 + #define BTRFS_INODE_ORDERED_DATA_CLOSE 0 35 + #define BTRFS_INODE_ORPHAN_META_RESERVED 1 36 + #define BTRFS_INODE_DUMMY 2 37 + #define BTRFS_INODE_IN_DEFRAG 3 38 + #define BTRFS_INODE_DELALLOC_META_RESERVED 4 39 + #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 40 + 27 41 /* in memory btrfs inode */ 28 42 struct btrfs_inode { 29 43 /* which subvolume this inode belongs to */ ··· 71 57 /* used to order data wrt metadata */ 72 58 struct btrfs_ordered_inode_tree ordered_tree; 73 59 74 - /* for keeping track of orphaned inodes */ 75 - struct list_head i_orphan; 76 - 77 60 /* list of all the delalloc inodes in the FS. There are times we need 78 61 * to write all the delalloc pages to disk, and this list is used 79 62 * to walk them all. ··· 89 78 /* the space_info for where this inode's data allocations are done */ 90 79 struct btrfs_space_info *space_info; 91 80 81 + unsigned long runtime_flags; 82 + 92 83 /* full 64 bit generation number, struct vfs_inode doesn't have a big 93 84 * enough field for this. 94 85 */ 95 86 u64 generation; 96 - 97 - /* sequence number for NFS changes */ 98 - u64 sequence; 99 87 100 88 /* 101 89 * transid of the trans_handle that last modified this inode ··· 155 145 unsigned reserved_extents; 156 146 157 147 /* 158 - * ordered_data_close is set by truncate when a file that used 159 - * to have good data has been truncated to zero. 
When it is set 160 - * the btrfs file release call will add this inode to the 161 - * ordered operations list so that we make sure to flush out any 162 - * new data the application may have written before commit. 163 - */ 164 - unsigned ordered_data_close:1; 165 - unsigned orphan_meta_reserved:1; 166 - unsigned dummy_inode:1; 167 - unsigned in_defrag:1; 168 - unsigned delalloc_meta_reserved:1; 169 - 170 - /* 171 148 * always compress this one file 172 149 */ 173 - unsigned force_compress:4; 150 + unsigned force_compress; 174 151 175 152 struct btrfs_delayed_node *delayed_node; 176 153 ··· 197 200 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) 198 201 return true; 199 202 return false; 203 + } 204 + 205 + static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) 206 + { 207 + struct btrfs_root *root = BTRFS_I(inode)->root; 208 + int ret = 0; 209 + 210 + mutex_lock(&root->log_mutex); 211 + if (BTRFS_I(inode)->logged_trans == generation && 212 + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 213 + ret = 1; 214 + mutex_unlock(&root->log_mutex); 215 + return ret; 200 216 } 201 217 202 218 #endif
+437 -149
fs/btrfs/check-integrity.c
··· 103 103 #define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 104 104 #define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, 105 105 * excluding " [...]" */ 106 - #define BTRFSIC_BLOCK_SIZE PAGE_SIZE 107 - 108 106 #define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) 109 107 110 108 /* ··· 208 210 u64 dev_bytenr; /* physical bytenr on device */ 209 211 u32 len; 210 212 struct btrfsic_dev_state *dev; 211 - char *data; 212 - struct buffer_head *bh; /* do not use if set to NULL */ 213 + char **datav; 214 + struct page **pagev; 215 + void *mem_to_free; 213 216 }; 214 217 215 218 /* This structure is used to implement recursion without occupying ··· 242 243 struct btrfs_root *root; 243 244 u64 max_superblock_generation; 244 245 struct btrfsic_block *latest_superblock; 246 + u32 metablock_size; 247 + u32 datablock_size; 245 248 }; 246 249 247 250 static void btrfsic_block_init(struct btrfsic_block *b); ··· 291 290 static int btrfsic_process_metablock(struct btrfsic_state *state, 292 291 struct btrfsic_block *block, 293 292 struct btrfsic_block_data_ctx *block_ctx, 294 - struct btrfs_header *hdr, 295 293 int limit_nesting, int force_iodone_flag); 294 + static void btrfsic_read_from_block_data( 295 + struct btrfsic_block_data_ctx *block_ctx, 296 + void *dst, u32 offset, size_t len); 296 297 static int btrfsic_create_link_to_next_block( 297 298 struct btrfsic_state *state, 298 299 struct btrfsic_block *block, ··· 321 318 static int btrfsic_read_block(struct btrfsic_state *state, 322 319 struct btrfsic_block_data_ctx *block_ctx); 323 320 static void btrfsic_dump_database(struct btrfsic_state *state); 321 + static void btrfsic_complete_bio_end_io(struct bio *bio, int err); 324 322 static int btrfsic_test_for_metadata(struct btrfsic_state *state, 325 - const u8 *data, unsigned int size); 323 + char **datav, unsigned int num_pages); 326 324 static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, 327 - u64 dev_bytenr, u8 *mapped_data, 328 - 
unsigned int len, struct bio *bio, 329 - int *bio_is_patched, 325 + u64 dev_bytenr, char **mapped_datav, 326 + unsigned int num_pages, 327 + struct bio *bio, int *bio_is_patched, 330 328 struct buffer_head *bh, 331 329 int submit_bio_bh_rw); 332 330 static int btrfsic_process_written_superblock( ··· 379 375 static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, 380 376 u64 bytenr, 381 377 struct btrfsic_dev_state *dev_state, 382 - u64 dev_bytenr, char *data); 378 + u64 dev_bytenr); 383 379 384 380 static struct mutex btrfsic_mutex; 385 381 static int btrfsic_is_initialized; ··· 655 651 int pass; 656 652 657 653 BUG_ON(NULL == state); 658 - selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); 654 + selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); 659 655 if (NULL == selected_super) { 660 656 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); 661 657 return -1; ··· 722 718 723 719 num_copies = 724 720 btrfs_num_copies(&state->root->fs_info->mapping_tree, 725 - next_bytenr, PAGE_SIZE); 721 + next_bytenr, state->metablock_size); 726 722 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 727 723 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 728 724 (unsigned long long)next_bytenr, num_copies); ··· 731 727 struct btrfsic_block *next_block; 732 728 struct btrfsic_block_data_ctx tmp_next_block_ctx; 733 729 struct btrfsic_block_link *l; 734 - struct btrfs_header *hdr; 735 730 736 - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, 731 + ret = btrfsic_map_block(state, next_bytenr, 732 + state->metablock_size, 737 733 &tmp_next_block_ctx, 738 734 mirror_num); 739 735 if (ret) { ··· 762 758 BUG_ON(NULL == l); 763 759 764 760 ret = btrfsic_read_block(state, &tmp_next_block_ctx); 765 - if (ret < (int)BTRFSIC_BLOCK_SIZE) { 761 + if (ret < (int)PAGE_CACHE_SIZE) { 766 762 printk(KERN_INFO 767 763 "btrfsic: read @logical %llu failed!\n", 768 764 (unsigned long long) ··· 772 768 return -1; 773 769 } 774 770 775 - hdr = 
(struct btrfs_header *)tmp_next_block_ctx.data; 776 771 ret = btrfsic_process_metablock(state, 777 772 next_block, 778 773 &tmp_next_block_ctx, 779 - hdr, 780 774 BTRFS_MAX_LEVEL + 3, 1); 781 775 btrfsic_release_block_ctx(&tmp_next_block_ctx); 782 776 } ··· 801 799 802 800 /* super block bytenr is always the unmapped device bytenr */ 803 801 dev_bytenr = btrfs_sb_offset(superblock_mirror_num); 804 - bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); 802 + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 803 + return -1; 804 + bh = __bread(superblock_bdev, dev_bytenr / 4096, 805 + BTRFS_SUPER_INFO_SIZE); 805 806 if (NULL == bh) 806 807 return -1; 807 808 super_tmp = (struct btrfs_super_block *) ··· 813 808 if (btrfs_super_bytenr(super_tmp) != dev_bytenr || 814 809 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, 815 810 sizeof(super_tmp->magic)) || 816 - memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { 811 + memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || 812 + btrfs_super_nodesize(super_tmp) != state->metablock_size || 813 + btrfs_super_leafsize(super_tmp) != state->metablock_size || 814 + btrfs_super_sectorsize(super_tmp) != state->datablock_size) { 817 815 brelse(bh); 818 816 return 0; 819 817 } ··· 901 893 902 894 num_copies = 903 895 btrfs_num_copies(&state->root->fs_info->mapping_tree, 904 - next_bytenr, PAGE_SIZE); 896 + next_bytenr, state->metablock_size); 905 897 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 906 898 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 907 899 (unsigned long long)next_bytenr, num_copies); ··· 910 902 struct btrfsic_block_data_ctx tmp_next_block_ctx; 911 903 struct btrfsic_block_link *l; 912 904 913 - if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, 905 + if (btrfsic_map_block(state, next_bytenr, 906 + state->metablock_size, 914 907 &tmp_next_block_ctx, 915 908 mirror_num)) { 916 909 printk(KERN_INFO "btrfsic: btrfsic_map_block(" ··· 975 966 struct 
btrfsic_state *state, 976 967 struct btrfsic_block *const first_block, 977 968 struct btrfsic_block_data_ctx *const first_block_ctx, 978 - struct btrfs_header *const first_hdr, 979 969 int first_limit_nesting, int force_iodone_flag) 980 970 { 981 971 struct btrfsic_stack_frame initial_stack_frame = { 0 }; 982 972 struct btrfsic_stack_frame *sf; 983 973 struct btrfsic_stack_frame *next_stack; 974 + struct btrfs_header *const first_hdr = 975 + (struct btrfs_header *)first_block_ctx->datav[0]; 984 976 977 + BUG_ON(!first_hdr); 985 978 sf = &initial_stack_frame; 986 979 sf->error = 0; 987 980 sf->i = -1; ··· 1023 1012 } 1024 1013 1025 1014 if (sf->i < sf->nr) { 1026 - struct btrfs_item *disk_item = leafhdr->items + sf->i; 1027 - struct btrfs_disk_key *disk_key = &disk_item->key; 1015 + struct btrfs_item disk_item; 1016 + u32 disk_item_offset = 1017 + (uintptr_t)(leafhdr->items + sf->i) - 1018 + (uintptr_t)leafhdr; 1019 + struct btrfs_disk_key *disk_key; 1028 1020 u8 type; 1029 - const u32 item_offset = le32_to_cpu(disk_item->offset); 1021 + u32 item_offset; 1030 1022 1023 + if (disk_item_offset + sizeof(struct btrfs_item) > 1024 + sf->block_ctx->len) { 1025 + leaf_item_out_of_bounce_error: 1026 + printk(KERN_INFO 1027 + "btrfsic: leaf item out of bounce at logical %llu, dev %s\n", 1028 + sf->block_ctx->start, 1029 + sf->block_ctx->dev->name); 1030 + goto one_stack_frame_backwards; 1031 + } 1032 + btrfsic_read_from_block_data(sf->block_ctx, 1033 + &disk_item, 1034 + disk_item_offset, 1035 + sizeof(struct btrfs_item)); 1036 + item_offset = le32_to_cpu(disk_item.offset); 1037 + disk_key = &disk_item.key; 1031 1038 type = disk_key->type; 1032 1039 1033 1040 if (BTRFS_ROOT_ITEM_KEY == type) { 1034 - const struct btrfs_root_item *const root_item = 1035 - (struct btrfs_root_item *) 1036 - (sf->block_ctx->data + 1037 - offsetof(struct btrfs_leaf, items) + 1038 - item_offset); 1039 - const u64 next_bytenr = 1040 - le64_to_cpu(root_item->bytenr); 1041 + struct btrfs_root_item 
root_item; 1042 + u32 root_item_offset; 1043 + u64 next_bytenr; 1044 + 1045 + root_item_offset = item_offset + 1046 + offsetof(struct btrfs_leaf, items); 1047 + if (root_item_offset + 1048 + sizeof(struct btrfs_root_item) > 1049 + sf->block_ctx->len) 1050 + goto leaf_item_out_of_bounce_error; 1051 + btrfsic_read_from_block_data( 1052 + sf->block_ctx, &root_item, 1053 + root_item_offset, 1054 + sizeof(struct btrfs_root_item)); 1055 + next_bytenr = le64_to_cpu(root_item.bytenr); 1041 1056 1042 1057 sf->error = 1043 1058 btrfsic_create_link_to_next_block( ··· 1078 1041 &sf->num_copies, 1079 1042 &sf->mirror_num, 1080 1043 disk_key, 1081 - le64_to_cpu(root_item-> 1044 + le64_to_cpu(root_item. 1082 1045 generation)); 1083 1046 if (sf->error) 1084 1047 goto one_stack_frame_backwards; ··· 1086 1049 if (NULL != sf->next_block) { 1087 1050 struct btrfs_header *const next_hdr = 1088 1051 (struct btrfs_header *) 1089 - sf->next_block_ctx.data; 1052 + sf->next_block_ctx.datav[0]; 1090 1053 1091 1054 next_stack = 1092 1055 btrfsic_stack_frame_alloc(); ··· 1148 1111 } 1149 1112 1150 1113 if (sf->i < sf->nr) { 1151 - struct btrfs_key_ptr *disk_key_ptr = 1152 - nodehdr->ptrs + sf->i; 1153 - const u64 next_bytenr = 1154 - le64_to_cpu(disk_key_ptr->blockptr); 1114 + struct btrfs_key_ptr key_ptr; 1115 + u32 key_ptr_offset; 1116 + u64 next_bytenr; 1117 + 1118 + key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - 1119 + (uintptr_t)nodehdr; 1120 + if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > 1121 + sf->block_ctx->len) { 1122 + printk(KERN_INFO 1123 + "btrfsic: node item out of bounce at logical %llu, dev %s\n", 1124 + sf->block_ctx->start, 1125 + sf->block_ctx->dev->name); 1126 + goto one_stack_frame_backwards; 1127 + } 1128 + btrfsic_read_from_block_data( 1129 + sf->block_ctx, &key_ptr, key_ptr_offset, 1130 + sizeof(struct btrfs_key_ptr)); 1131 + next_bytenr = le64_to_cpu(key_ptr.blockptr); 1155 1132 1156 1133 sf->error = btrfsic_create_link_to_next_block( 1157 1134 state, ··· 
1178 1127 force_iodone_flag, 1179 1128 &sf->num_copies, 1180 1129 &sf->mirror_num, 1181 - &disk_key_ptr->key, 1182 - le64_to_cpu(disk_key_ptr->generation)); 1130 + &key_ptr.key, 1131 + le64_to_cpu(key_ptr.generation)); 1183 1132 if (sf->error) 1184 1133 goto one_stack_frame_backwards; 1185 1134 1186 1135 if (NULL != sf->next_block) { 1187 1136 struct btrfs_header *const next_hdr = 1188 1137 (struct btrfs_header *) 1189 - sf->next_block_ctx.data; 1138 + sf->next_block_ctx.datav[0]; 1190 1139 1191 1140 next_stack = btrfsic_stack_frame_alloc(); 1192 1141 if (NULL == next_stack) ··· 1232 1181 return sf->error; 1233 1182 } 1234 1183 1184 + static void btrfsic_read_from_block_data( 1185 + struct btrfsic_block_data_ctx *block_ctx, 1186 + void *dstv, u32 offset, size_t len) 1187 + { 1188 + size_t cur; 1189 + size_t offset_in_page; 1190 + char *kaddr; 1191 + char *dst = (char *)dstv; 1192 + size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1); 1193 + unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; 1194 + 1195 + WARN_ON(offset + len > block_ctx->len); 1196 + offset_in_page = (start_offset + offset) & 1197 + ((unsigned long)PAGE_CACHE_SIZE - 1); 1198 + 1199 + while (len > 0) { 1200 + cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); 1201 + BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >> 1202 + PAGE_CACHE_SHIFT); 1203 + kaddr = block_ctx->datav[i]; 1204 + memcpy(dst, kaddr + offset_in_page, cur); 1205 + 1206 + dst += cur; 1207 + len -= cur; 1208 + offset_in_page = 0; 1209 + i++; 1210 + } 1211 + } 1212 + 1235 1213 static int btrfsic_create_link_to_next_block( 1236 1214 struct btrfsic_state *state, 1237 1215 struct btrfsic_block *block, ··· 1284 1204 if (0 == *num_copiesp) { 1285 1205 *num_copiesp = 1286 1206 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1287 - next_bytenr, PAGE_SIZE); 1207 + next_bytenr, state->metablock_size); 1288 1208 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1289 1209 printk(KERN_INFO 
"num_copies(log_bytenr=%llu) = %d\n", 1290 1210 (unsigned long long)next_bytenr, *num_copiesp); ··· 1299 1219 "btrfsic_create_link_to_next_block(mirror_num=%d)\n", 1300 1220 *mirror_nump); 1301 1221 ret = btrfsic_map_block(state, next_bytenr, 1302 - BTRFSIC_BLOCK_SIZE, 1222 + state->metablock_size, 1303 1223 next_block_ctx, *mirror_nump); 1304 1224 if (ret) { 1305 1225 printk(KERN_INFO ··· 1394 1314 1395 1315 if (limit_nesting > 0 && did_alloc_block_link) { 1396 1316 ret = btrfsic_read_block(state, next_block_ctx); 1397 - if (ret < (int)BTRFSIC_BLOCK_SIZE) { 1317 + if (ret < (int)next_block_ctx->len) { 1398 1318 printk(KERN_INFO 1399 1319 "btrfsic: read block @logical %llu failed!\n", 1400 1320 (unsigned long long)next_bytenr); ··· 1419 1339 u32 item_offset, int force_iodone_flag) 1420 1340 { 1421 1341 int ret; 1422 - struct btrfs_file_extent_item *file_extent_item = 1423 - (struct btrfs_file_extent_item *)(block_ctx->data + 1424 - offsetof(struct btrfs_leaf, 1425 - items) + item_offset); 1426 - u64 next_bytenr = 1427 - le64_to_cpu(file_extent_item->disk_bytenr) + 1428 - le64_to_cpu(file_extent_item->offset); 1429 - u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); 1430 - u64 generation = le64_to_cpu(file_extent_item->generation); 1342 + struct btrfs_file_extent_item file_extent_item; 1343 + u64 file_extent_item_offset; 1344 + u64 next_bytenr; 1345 + u64 num_bytes; 1346 + u64 generation; 1431 1347 struct btrfsic_block_link *l; 1348 + 1349 + file_extent_item_offset = offsetof(struct btrfs_leaf, items) + 1350 + item_offset; 1351 + if (file_extent_item_offset + 1352 + offsetof(struct btrfs_file_extent_item, disk_num_bytes) > 1353 + block_ctx->len) { 1354 + printk(KERN_INFO 1355 + "btrfsic: file item out of bounce at logical %llu, dev %s\n", 1356 + block_ctx->start, block_ctx->dev->name); 1357 + return -1; 1358 + } 1359 + 1360 + btrfsic_read_from_block_data(block_ctx, &file_extent_item, 1361 + file_extent_item_offset, 1362 + offsetof(struct 
btrfs_file_extent_item, disk_num_bytes)); 1363 + if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || 1364 + ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) { 1365 + if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) 1366 + printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n", 1367 + file_extent_item.type, 1368 + (unsigned long long) 1369 + le64_to_cpu(file_extent_item.disk_bytenr)); 1370 + return 0; 1371 + } 1372 + 1373 + if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > 1374 + block_ctx->len) { 1375 + printk(KERN_INFO 1376 + "btrfsic: file item out of bounce at logical %llu, dev %s\n", 1377 + block_ctx->start, block_ctx->dev->name); 1378 + return -1; 1379 + } 1380 + btrfsic_read_from_block_data(block_ctx, &file_extent_item, 1381 + file_extent_item_offset, 1382 + sizeof(struct btrfs_file_extent_item)); 1383 + next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) + 1384 + le64_to_cpu(file_extent_item.offset); 1385 + generation = le64_to_cpu(file_extent_item.generation); 1386 + num_bytes = le64_to_cpu(file_extent_item.num_bytes); 1387 + generation = le64_to_cpu(file_extent_item.generation); 1432 1388 1433 1389 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) 1434 1390 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," 1435 1391 " offset = %llu, num_bytes = %llu\n", 1436 - file_extent_item->type, 1392 + file_extent_item.type, 1437 1393 (unsigned long long) 1438 - le64_to_cpu(file_extent_item->disk_bytenr), 1439 - (unsigned long long) 1440 - le64_to_cpu(file_extent_item->offset), 1441 - (unsigned long long) 1442 - le64_to_cpu(file_extent_item->num_bytes)); 1443 - if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || 1444 - ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) 1445 - return 0; 1394 + le64_to_cpu(file_extent_item.disk_bytenr), 1395 + (unsigned long long)le64_to_cpu(file_extent_item.offset), 1396 + (unsigned long long)num_bytes); 1446 1397 while (num_bytes > 0) { 1447 1398 u32 chunk_len; 
1448 1399 int num_copies; 1449 1400 int mirror_num; 1450 1401 1451 - if (num_bytes > BTRFSIC_BLOCK_SIZE) 1452 - chunk_len = BTRFSIC_BLOCK_SIZE; 1402 + if (num_bytes > state->datablock_size) 1403 + chunk_len = state->datablock_size; 1453 1404 else 1454 1405 chunk_len = num_bytes; 1455 1406 1456 1407 num_copies = 1457 1408 btrfs_num_copies(&state->root->fs_info->mapping_tree, 1458 - next_bytenr, PAGE_SIZE); 1409 + next_bytenr, state->datablock_size); 1459 1410 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 1460 1411 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 1461 1412 (unsigned long long)next_bytenr, num_copies); ··· 1586 1475 block_ctx_out->dev_bytenr = multi->stripes[0].physical; 1587 1476 block_ctx_out->start = bytenr; 1588 1477 block_ctx_out->len = len; 1589 - block_ctx_out->data = NULL; 1590 - block_ctx_out->bh = NULL; 1478 + block_ctx_out->datav = NULL; 1479 + block_ctx_out->pagev = NULL; 1480 + block_ctx_out->mem_to_free = NULL; 1591 1481 1592 1482 if (0 == ret) 1593 1483 kfree(multi); ··· 1608 1496 block_ctx_out->dev_bytenr = bytenr; 1609 1497 block_ctx_out->start = bytenr; 1610 1498 block_ctx_out->len = len; 1611 - block_ctx_out->data = NULL; 1612 - block_ctx_out->bh = NULL; 1499 + block_ctx_out->datav = NULL; 1500 + block_ctx_out->pagev = NULL; 1501 + block_ctx_out->mem_to_free = NULL; 1613 1502 if (NULL != block_ctx_out->dev) { 1614 1503 return 0; 1615 1504 } else { ··· 1621 1508 1622 1509 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) 1623 1510 { 1624 - if (NULL != block_ctx->bh) { 1625 - brelse(block_ctx->bh); 1626 - block_ctx->bh = NULL; 1511 + if (block_ctx->mem_to_free) { 1512 + unsigned int num_pages; 1513 + 1514 + BUG_ON(!block_ctx->datav); 1515 + BUG_ON(!block_ctx->pagev); 1516 + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> 1517 + PAGE_CACHE_SHIFT; 1518 + while (num_pages > 0) { 1519 + num_pages--; 1520 + if (block_ctx->datav[num_pages]) { 1521 + kunmap(block_ctx->pagev[num_pages]); 
1522 + block_ctx->datav[num_pages] = NULL; 1523 + } 1524 + if (block_ctx->pagev[num_pages]) { 1525 + __free_page(block_ctx->pagev[num_pages]); 1526 + block_ctx->pagev[num_pages] = NULL; 1527 + } 1528 + } 1529 + 1530 + kfree(block_ctx->mem_to_free); 1531 + block_ctx->mem_to_free = NULL; 1532 + block_ctx->pagev = NULL; 1533 + block_ctx->datav = NULL; 1627 1534 } 1628 1535 } 1629 1536 1630 1537 static int btrfsic_read_block(struct btrfsic_state *state, 1631 1538 struct btrfsic_block_data_ctx *block_ctx) 1632 1539 { 1633 - block_ctx->bh = NULL; 1634 - if (block_ctx->dev_bytenr & 4095) { 1540 + unsigned int num_pages; 1541 + unsigned int i; 1542 + u64 dev_bytenr; 1543 + int ret; 1544 + 1545 + BUG_ON(block_ctx->datav); 1546 + BUG_ON(block_ctx->pagev); 1547 + BUG_ON(block_ctx->mem_to_free); 1548 + if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) { 1635 1549 printk(KERN_INFO 1636 1550 "btrfsic: read_block() with unaligned bytenr %llu\n", 1637 1551 (unsigned long long)block_ctx->dev_bytenr); 1638 1552 return -1; 1639 1553 } 1640 - if (block_ctx->len > 4096) { 1641 - printk(KERN_INFO 1642 - "btrfsic: read_block() with too huge size %d\n", 1643 - block_ctx->len); 1554 + 1555 + num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >> 1556 + PAGE_CACHE_SHIFT; 1557 + block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) + 1558 + sizeof(*block_ctx->pagev)) * 1559 + num_pages, GFP_NOFS); 1560 + if (!block_ctx->mem_to_free) 1644 1561 return -1; 1562 + block_ctx->datav = block_ctx->mem_to_free; 1563 + block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); 1564 + for (i = 0; i < num_pages; i++) { 1565 + block_ctx->pagev[i] = alloc_page(GFP_NOFS); 1566 + if (!block_ctx->pagev[i]) 1567 + return -1; 1645 1568 } 1646 1569 1647 - block_ctx->bh = __bread(block_ctx->dev->bdev, 1648 - block_ctx->dev_bytenr >> 12, 4096); 1649 - if (NULL == block_ctx->bh) 1650 - return -1; 1651 - block_ctx->data = block_ctx->bh->b_data; 1570 + dev_bytenr = block_ctx->dev_bytenr; 1571 
+ for (i = 0; i < num_pages;) { 1572 + struct bio *bio; 1573 + unsigned int j; 1574 + DECLARE_COMPLETION_ONSTACK(complete); 1575 + 1576 + bio = bio_alloc(GFP_NOFS, num_pages - i); 1577 + if (!bio) { 1578 + printk(KERN_INFO 1579 + "btrfsic: bio_alloc() for %u pages failed!\n", 1580 + num_pages - i); 1581 + return -1; 1582 + } 1583 + bio->bi_bdev = block_ctx->dev->bdev; 1584 + bio->bi_sector = dev_bytenr >> 9; 1585 + bio->bi_end_io = btrfsic_complete_bio_end_io; 1586 + bio->bi_private = &complete; 1587 + 1588 + for (j = i; j < num_pages; j++) { 1589 + ret = bio_add_page(bio, block_ctx->pagev[j], 1590 + PAGE_CACHE_SIZE, 0); 1591 + if (PAGE_CACHE_SIZE != ret) 1592 + break; 1593 + } 1594 + if (j == i) { 1595 + printk(KERN_INFO 1596 + "btrfsic: error, failed to add a single page!\n"); 1597 + return -1; 1598 + } 1599 + submit_bio(READ, bio); 1600 + 1601 + /* this will also unplug the queue */ 1602 + wait_for_completion(&complete); 1603 + 1604 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1605 + printk(KERN_INFO 1606 + "btrfsic: read error at logical %llu dev %s!\n", 1607 + block_ctx->start, block_ctx->dev->name); 1608 + bio_put(bio); 1609 + return -1; 1610 + } 1611 + bio_put(bio); 1612 + dev_bytenr += (j - i) * PAGE_CACHE_SIZE; 1613 + i = j; 1614 + } 1615 + for (i = 0; i < num_pages; i++) { 1616 + block_ctx->datav[i] = kmap(block_ctx->pagev[i]); 1617 + if (!block_ctx->datav[i]) { 1618 + printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n", 1619 + block_ctx->dev->name); 1620 + return -1; 1621 + } 1622 + } 1652 1623 1653 1624 return block_ctx->len; 1625 + } 1626 + 1627 + static void btrfsic_complete_bio_end_io(struct bio *bio, int err) 1628 + { 1629 + complete((struct completion *)bio->bi_private); 1654 1630 } 1655 1631 1656 1632 static void btrfsic_dump_database(struct btrfsic_state *state) ··· 1819 1617 * (note that this test fails for the super block) 1820 1618 */ 1821 1619 static int btrfsic_test_for_metadata(struct btrfsic_state *state, 1822 - const u8 *data, 
unsigned int size) 1620 + char **datav, unsigned int num_pages) 1823 1621 { 1824 1622 struct btrfs_header *h; 1825 1623 u8 csum[BTRFS_CSUM_SIZE]; 1826 1624 u32 crc = ~(u32)0; 1827 - int fail = 0; 1828 - int crc_fail = 0; 1625 + unsigned int i; 1829 1626 1830 - h = (struct btrfs_header *)data; 1627 + if (num_pages * PAGE_CACHE_SIZE < state->metablock_size) 1628 + return 1; /* not metadata */ 1629 + num_pages = state->metablock_size >> PAGE_CACHE_SHIFT; 1630 + h = (struct btrfs_header *)datav[0]; 1831 1631 1832 1632 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) 1833 - fail++; 1633 + return 1; 1834 1634 1835 - crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); 1635 + for (i = 0; i < num_pages; i++) { 1636 + u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); 1637 + size_t sublen = i ? PAGE_CACHE_SIZE : 1638 + (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); 1639 + 1640 + crc = crc32c(crc, data, sublen); 1641 + } 1836 1642 btrfs_csum_final(crc, csum); 1837 1643 if (memcmp(csum, h->csum, state->csum_size)) 1838 - crc_fail++; 1644 + return 1; 1839 1645 1840 - return fail || crc_fail; 1646 + return 0; /* is metadata */ 1841 1647 } 1842 1648 1843 1649 static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, 1844 - u64 dev_bytenr, 1845 - u8 *mapped_data, unsigned int len, 1846 - struct bio *bio, 1847 - int *bio_is_patched, 1650 + u64 dev_bytenr, char **mapped_datav, 1651 + unsigned int num_pages, 1652 + struct bio *bio, int *bio_is_patched, 1848 1653 struct buffer_head *bh, 1849 1654 int submit_bio_bh_rw) 1850 1655 { ··· 1861 1652 int ret; 1862 1653 struct btrfsic_state *state = dev_state->state; 1863 1654 struct block_device *bdev = dev_state->bdev; 1655 + unsigned int processed_len; 1864 1656 1865 - WARN_ON(len > PAGE_SIZE); 1866 - is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); 1867 1657 if (NULL != bio_is_patched) 1868 1658 *bio_is_patched = 0; 1659 + 1660 + again: 1661 + if (num_pages == 0) 
1662 + return; 1663 + 1664 + processed_len = 0; 1665 + is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, 1666 + num_pages)); 1869 1667 1870 1668 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, 1871 1669 &state->block_hashtable); ··· 1883 1667 1884 1668 if (block->is_superblock) { 1885 1669 bytenr = le64_to_cpu(((struct btrfs_super_block *) 1886 - mapped_data)->bytenr); 1670 + mapped_datav[0])->bytenr); 1671 + if (num_pages * PAGE_CACHE_SIZE < 1672 + BTRFS_SUPER_INFO_SIZE) { 1673 + printk(KERN_INFO 1674 + "btrfsic: cannot work with too short bios!\n"); 1675 + return; 1676 + } 1887 1677 is_metadata = 1; 1678 + BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1)); 1679 + processed_len = BTRFS_SUPER_INFO_SIZE; 1888 1680 if (state->print_mask & 1889 1681 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { 1890 1682 printk(KERN_INFO ··· 1902 1678 } 1903 1679 if (is_metadata) { 1904 1680 if (!block->is_superblock) { 1681 + if (num_pages * PAGE_CACHE_SIZE < 1682 + state->metablock_size) { 1683 + printk(KERN_INFO 1684 + "btrfsic: cannot work with too short bios!\n"); 1685 + return; 1686 + } 1687 + processed_len = state->metablock_size; 1905 1688 bytenr = le64_to_cpu(((struct btrfs_header *) 1906 - mapped_data)->bytenr); 1689 + mapped_datav[0])->bytenr); 1907 1690 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, 1908 1691 dev_state, 1909 - dev_bytenr, 1910 - mapped_data); 1692 + dev_bytenr); 1911 1693 } 1912 1694 if (block->logical_bytenr != bytenr) { 1913 1695 printk(KERN_INFO ··· 1940 1710 block->mirror_num, 1941 1711 btrfsic_get_block_type(state, block)); 1942 1712 } else { 1713 + if (num_pages * PAGE_CACHE_SIZE < 1714 + state->datablock_size) { 1715 + printk(KERN_INFO 1716 + "btrfsic: cannot work with too short bios!\n"); 1717 + return; 1718 + } 1719 + processed_len = state->datablock_size; 1943 1720 bytenr = block->logical_bytenr; 1944 1721 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 1945 1722 printk(KERN_INFO ··· 1984 1747 
le64_to_cpu(block->disk_key.offset), 1985 1748 (unsigned long long) 1986 1749 le64_to_cpu(((struct btrfs_header *) 1987 - mapped_data)->generation), 1750 + mapped_datav[0])->generation), 1988 1751 (unsigned long long) 1989 1752 state->max_superblock_generation); 1990 1753 btrfsic_dump_tree(state); ··· 2002 1765 (unsigned long long)block->generation, 2003 1766 (unsigned long long) 2004 1767 le64_to_cpu(((struct btrfs_header *) 2005 - mapped_data)->generation)); 1768 + mapped_datav[0])->generation)); 2006 1769 /* it would not be safe to go on */ 2007 1770 btrfsic_dump_tree(state); 2008 - return; 1771 + goto continue_loop; 2009 1772 } 2010 1773 2011 1774 /* ··· 2033 1796 } 2034 1797 2035 1798 if (block->is_superblock) 2036 - ret = btrfsic_map_superblock(state, bytenr, len, 1799 + ret = btrfsic_map_superblock(state, bytenr, 1800 + processed_len, 2037 1801 bdev, &block_ctx); 2038 1802 else 2039 - ret = btrfsic_map_block(state, bytenr, len, 1803 + ret = btrfsic_map_block(state, bytenr, processed_len, 2040 1804 &block_ctx, 0); 2041 1805 if (ret) { 2042 1806 printk(KERN_INFO 2043 1807 "btrfsic: btrfsic_map_block(root @%llu)" 2044 1808 " failed!\n", (unsigned long long)bytenr); 2045 - return; 1809 + goto continue_loop; 2046 1810 } 2047 - block_ctx.data = mapped_data; 1811 + block_ctx.datav = mapped_datav; 2048 1812 /* the following is required in case of writes to mirrors, 2049 1813 * use the same that was used for the lookup */ 2050 1814 block_ctx.dev = dev_state; ··· 2101 1863 block->logical_bytenr = bytenr; 2102 1864 block->is_metadata = 1; 2103 1865 if (block->is_superblock) { 1866 + BUG_ON(PAGE_CACHE_SIZE != 1867 + BTRFS_SUPER_INFO_SIZE); 2104 1868 ret = btrfsic_process_written_superblock( 2105 1869 state, 2106 1870 block, 2107 1871 (struct btrfs_super_block *) 2108 - mapped_data); 1872 + mapped_datav[0]); 2109 1873 if (state->print_mask & 2110 1874 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { 2111 1875 printk(KERN_INFO ··· 2120 1880 state, 2121 1881 block, 2122 1882 
&block_ctx, 2123 - (struct btrfs_header *) 2124 - block_ctx.data, 2125 1883 0, 0); 2126 1884 } 2127 1885 if (ret) ··· 2150 1912 u64 bytenr; 2151 1913 2152 1914 if (!is_metadata) { 1915 + processed_len = state->datablock_size; 2153 1916 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 2154 1917 printk(KERN_INFO "Written block (%s/%llu/?)" 2155 1918 " !found in hash table, D.\n", 2156 1919 dev_state->name, 2157 1920 (unsigned long long)dev_bytenr); 2158 - if (!state->include_extent_data) 2159 - return; /* ignore that written D block */ 1921 + if (!state->include_extent_data) { 1922 + /* ignore that written D block */ 1923 + goto continue_loop; 1924 + } 2160 1925 2161 1926 /* this is getting ugly for the 2162 1927 * include_extent_data case... */ 2163 1928 bytenr = 0; /* unknown */ 2164 1929 block_ctx.start = bytenr; 2165 - block_ctx.len = len; 2166 - block_ctx.bh = NULL; 1930 + block_ctx.len = processed_len; 1931 + block_ctx.mem_to_free = NULL; 1932 + block_ctx.pagev = NULL; 2167 1933 } else { 1934 + processed_len = state->metablock_size; 2168 1935 bytenr = le64_to_cpu(((struct btrfs_header *) 2169 - mapped_data)->bytenr); 1936 + mapped_datav[0])->bytenr); 2170 1937 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, 2171 - dev_bytenr, 2172 - mapped_data); 1938 + dev_bytenr); 2173 1939 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) 2174 1940 printk(KERN_INFO 2175 1941 "Written block @%llu (%s/%llu/?)" ··· 2182 1940 dev_state->name, 2183 1941 (unsigned long long)dev_bytenr); 2184 1942 2185 - ret = btrfsic_map_block(state, bytenr, len, &block_ctx, 2186 - 0); 1943 + ret = btrfsic_map_block(state, bytenr, processed_len, 1944 + &block_ctx, 0); 2187 1945 if (ret) { 2188 1946 printk(KERN_INFO 2189 1947 "btrfsic: btrfsic_map_block(root @%llu)" 2190 1948 " failed!\n", 2191 1949 (unsigned long long)dev_bytenr); 2192 - return; 1950 + goto continue_loop; 2193 1951 } 2194 1952 } 2195 - block_ctx.data = mapped_data; 1953 + block_ctx.datav = mapped_datav; 2196 1954 /* 
the following is required in case of writes to mirrors, 2197 1955 * use the same that was used for the lookup */ 2198 1956 block_ctx.dev = dev_state; ··· 2202 1960 if (NULL == block) { 2203 1961 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); 2204 1962 btrfsic_release_block_ctx(&block_ctx); 2205 - return; 1963 + goto continue_loop; 2206 1964 } 2207 1965 block->dev_state = dev_state; 2208 1966 block->dev_bytenr = dev_bytenr; ··· 2262 2020 2263 2021 if (is_metadata) { 2264 2022 ret = btrfsic_process_metablock(state, block, 2265 - &block_ctx, 2266 - (struct btrfs_header *) 2267 - block_ctx.data, 0, 0); 2023 + &block_ctx, 0, 0); 2268 2024 if (ret) 2269 2025 printk(KERN_INFO 2270 2026 "btrfsic: process_metablock(root @%llu)" ··· 2271 2031 } 2272 2032 btrfsic_release_block_ctx(&block_ctx); 2273 2033 } 2034 + 2035 + continue_loop: 2036 + BUG_ON(!processed_len); 2037 + dev_bytenr += processed_len; 2038 + mapped_datav += processed_len >> PAGE_CACHE_SHIFT; 2039 + num_pages -= processed_len >> PAGE_CACHE_SHIFT; 2040 + goto again; 2274 2041 } 2275 2042 2276 2043 static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) ··· 2460 2213 2461 2214 num_copies = 2462 2215 btrfs_num_copies(&state->root->fs_info->mapping_tree, 2463 - next_bytenr, PAGE_SIZE); 2216 + next_bytenr, BTRFS_SUPER_INFO_SIZE); 2464 2217 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) 2465 2218 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", 2466 2219 (unsigned long long)next_bytenr, num_copies); ··· 2471 2224 printk(KERN_INFO 2472 2225 "btrfsic_process_written_superblock(" 2473 2226 "mirror_num=%d)\n", mirror_num); 2474 - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, 2227 + ret = btrfsic_map_block(state, next_bytenr, 2228 + BTRFS_SUPER_INFO_SIZE, 2475 2229 &tmp_next_block_ctx, 2476 2230 mirror_num); 2477 2231 if (ret) { ··· 2937 2689 static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, 2938 2690 u64 bytenr, 2939 2691 struct btrfsic_dev_state 
*dev_state, 2940 - u64 dev_bytenr, char *data) 2692 + u64 dev_bytenr) 2941 2693 { 2942 2694 int num_copies; 2943 2695 int mirror_num; ··· 2946 2698 int match = 0; 2947 2699 2948 2700 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, 2949 - bytenr, PAGE_SIZE); 2701 + bytenr, state->metablock_size); 2950 2702 2951 2703 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2952 - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, 2704 + ret = btrfsic_map_block(state, bytenr, state->metablock_size, 2953 2705 &block_ctx, mirror_num); 2954 2706 if (ret) { 2955 2707 printk(KERN_INFO "btrfsic:" ··· 2975 2727 (unsigned long long)bytenr, dev_state->name, 2976 2728 (unsigned long long)dev_bytenr); 2977 2729 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2978 - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, 2730 + ret = btrfsic_map_block(state, bytenr, 2731 + state->metablock_size, 2979 2732 &block_ctx, mirror_num); 2980 2733 if (ret) 2981 2734 continue; ··· 3030 2781 (unsigned long)bh->b_size, bh->b_data, 3031 2782 bh->b_bdev); 3032 2783 btrfsic_process_written_block(dev_state, dev_bytenr, 3033 - bh->b_data, bh->b_size, NULL, 2784 + &bh->b_data, 1, NULL, 3034 2785 NULL, bh, rw); 3035 2786 } else if (NULL != dev_state && (rw & REQ_FLUSH)) { 3036 2787 if (dev_state->state->print_mask & 3037 2788 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 3038 2789 printk(KERN_INFO 3039 - "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", 2790 + "submit_bh(rw=0x%x FLUSH, bdev=%p)\n", 3040 2791 rw, bh->b_bdev); 3041 2792 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { 3042 2793 if ((dev_state->state->print_mask & ··· 3085 2836 unsigned int i; 3086 2837 u64 dev_bytenr; 3087 2838 int bio_is_patched; 2839 + char **mapped_datav; 3088 2840 3089 2841 dev_bytenr = 512 * bio->bi_sector; 3090 2842 bio_is_patched = 0; ··· 3098 2848 (unsigned long long)dev_bytenr, 3099 2849 bio->bi_bdev); 3100 2850 2851 + mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, 2852 + 
GFP_NOFS); 2853 + if (!mapped_datav) 2854 + goto leave; 3101 2855 for (i = 0; i < bio->bi_vcnt; i++) { 3102 - u8 *mapped_data; 3103 - 3104 - mapped_data = kmap(bio->bi_io_vec[i].bv_page); 2856 + BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); 2857 + mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); 2858 + if (!mapped_datav[i]) { 2859 + while (i > 0) { 2860 + i--; 2861 + kunmap(bio->bi_io_vec[i].bv_page); 2862 + } 2863 + kfree(mapped_datav); 2864 + goto leave; 2865 + } 3105 2866 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | 3106 2867 BTRFSIC_PRINT_MASK_VERBOSE) == 3107 2868 (dev_state->state->print_mask & 3108 2869 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | 3109 2870 BTRFSIC_PRINT_MASK_VERBOSE))) 3110 2871 printk(KERN_INFO 3111 - "#%u: page=%p, mapped=%p, len=%u," 3112 - " offset=%u\n", 2872 + "#%u: page=%p, len=%u, offset=%u\n", 3113 2873 i, bio->bi_io_vec[i].bv_page, 3114 - mapped_data, 3115 2874 bio->bi_io_vec[i].bv_len, 3116 2875 bio->bi_io_vec[i].bv_offset); 3117 - btrfsic_process_written_block(dev_state, dev_bytenr, 3118 - mapped_data, 3119 - bio->bi_io_vec[i].bv_len, 3120 - bio, &bio_is_patched, 3121 - NULL, rw); 3122 - kunmap(bio->bi_io_vec[i].bv_page); 3123 - dev_bytenr += bio->bi_io_vec[i].bv_len; 3124 2876 } 2877 + btrfsic_process_written_block(dev_state, dev_bytenr, 2878 + mapped_datav, bio->bi_vcnt, 2879 + bio, &bio_is_patched, 2880 + NULL, rw); 2881 + while (i > 0) { 2882 + i--; 2883 + kunmap(bio->bi_io_vec[i].bv_page); 2884 + } 2885 + kfree(mapped_datav); 3125 2886 } else if (NULL != dev_state && (rw & REQ_FLUSH)) { 3126 2887 if (dev_state->state->print_mask & 3127 2888 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 3128 2889 printk(KERN_INFO 3129 - "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", 2890 + "submit_bio(rw=0x%x FLUSH, bdev=%p)\n", 3130 2891 rw, bio->bi_bdev); 3131 2892 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { 3132 2893 if ((dev_state->state->print_mask & ··· 3164 2903 bio->bi_end_io = btrfsic_bio_end_io; 3165 2904 } 3166 2905 } 2906 + leave: 3167 
2907 mutex_unlock(&btrfsic_mutex); 3168 2908 3169 2909 submit_bio(rw, bio); ··· 3179 2917 struct list_head *dev_head = &fs_devices->devices; 3180 2918 struct btrfs_device *device; 3181 2919 2920 + if (root->nodesize != root->leafsize) { 2921 + printk(KERN_INFO 2922 + "btrfsic: cannot handle nodesize %d != leafsize %d!\n", 2923 + root->nodesize, root->leafsize); 2924 + return -1; 2925 + } 2926 + if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { 2927 + printk(KERN_INFO 2928 + "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", 2929 + root->nodesize, (unsigned long)PAGE_CACHE_SIZE); 2930 + return -1; 2931 + } 2932 + if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { 2933 + printk(KERN_INFO 2934 + "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", 2935 + root->leafsize, (unsigned long)PAGE_CACHE_SIZE); 2936 + return -1; 2937 + } 2938 + if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { 2939 + printk(KERN_INFO 2940 + "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", 2941 + root->sectorsize, (unsigned long)PAGE_CACHE_SIZE); 2942 + return -1; 2943 + } 3182 2944 state = kzalloc(sizeof(*state), GFP_NOFS); 3183 2945 if (NULL == state) { 3184 2946 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); ··· 3219 2933 state->print_mask = print_mask; 3220 2934 state->include_extent_data = including_extent_data; 3221 2935 state->csum_size = 0; 2936 + state->metablock_size = root->nodesize; 2937 + state->datablock_size = root->sectorsize; 3222 2938 INIT_LIST_HEAD(&state->all_blocks_list); 3223 2939 btrfsic_block_hashtable_init(&state->block_hashtable); 3224 2940 btrfsic_block_link_hashtable_init(&state->block_link_hashtable); ··· 3337 3049 btrfsic_block_link_free(l); 3338 3050 } 3339 3051 3340 - if (b_all->is_iodone) 3052 + if (b_all->is_iodone || b_all->never_written) 3341 3053 btrfsic_block_free(b_all); 3342 3054 else 3343 3055 printk(KERN_INFO "btrfs: attempt to 
free %c-block"
+830 -31
fs/btrfs/ctree.c
··· 18 18 19 19 #include <linux/sched.h> 20 20 #include <linux/slab.h> 21 + #include <linux/rbtree.h> 21 22 #include "ctree.h" 22 23 #include "disk-io.h" 23 24 #include "transaction.h" ··· 38 37 struct extent_buffer *dst_buf, 39 38 struct extent_buffer *src_buf); 40 39 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 41 - struct btrfs_path *path, int level, int slot); 40 + struct btrfs_path *path, int level, int slot, 41 + int tree_mod_log); 42 + static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 43 + struct extent_buffer *eb); 44 + struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, 45 + u32 blocksize, u64 parent_transid, 46 + u64 time_seq); 47 + struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root, 48 + u64 bytenr, u32 blocksize, 49 + u64 time_seq); 42 50 43 51 struct btrfs_path *btrfs_alloc_path(void) 44 52 { ··· 265 255 266 256 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 267 257 new_root_objectid, &disk_key, level, 268 - buf->start, 0, 1); 258 + buf->start, 0); 269 259 if (IS_ERR(cow)) 270 260 return PTR_ERR(cow); 271 261 ··· 296 286 btrfs_mark_buffer_dirty(cow); 297 287 *cow_ret = cow; 298 288 return 0; 289 + } 290 + 291 + enum mod_log_op { 292 + MOD_LOG_KEY_REPLACE, 293 + MOD_LOG_KEY_ADD, 294 + MOD_LOG_KEY_REMOVE, 295 + MOD_LOG_KEY_REMOVE_WHILE_FREEING, 296 + MOD_LOG_KEY_REMOVE_WHILE_MOVING, 297 + MOD_LOG_MOVE_KEYS, 298 + MOD_LOG_ROOT_REPLACE, 299 + }; 300 + 301 + struct tree_mod_move { 302 + int dst_slot; 303 + int nr_items; 304 + }; 305 + 306 + struct tree_mod_root { 307 + u64 logical; 308 + u8 level; 309 + }; 310 + 311 + struct tree_mod_elem { 312 + struct rb_node node; 313 + u64 index; /* shifted logical */ 314 + struct seq_list elem; 315 + enum mod_log_op op; 316 + 317 + /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ 318 + int slot; 319 + 320 + /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ 321 + u64 generation; 322 + 
323 + /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ 324 + struct btrfs_disk_key key; 325 + u64 blockptr; 326 + 327 + /* this is used for op == MOD_LOG_MOVE_KEYS */ 328 + struct tree_mod_move move; 329 + 330 + /* this is used for op == MOD_LOG_ROOT_REPLACE */ 331 + struct tree_mod_root old_root; 332 + }; 333 + 334 + static inline void 335 + __get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) 336 + { 337 + elem->seq = atomic_inc_return(&fs_info->tree_mod_seq); 338 + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); 339 + } 340 + 341 + void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, 342 + struct seq_list *elem) 343 + { 344 + elem->flags = 1; 345 + spin_lock(&fs_info->tree_mod_seq_lock); 346 + __get_tree_mod_seq(fs_info, elem); 347 + spin_unlock(&fs_info->tree_mod_seq_lock); 348 + } 349 + 350 + void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, 351 + struct seq_list *elem) 352 + { 353 + struct rb_root *tm_root; 354 + struct rb_node *node; 355 + struct rb_node *next; 356 + struct seq_list *cur_elem; 357 + struct tree_mod_elem *tm; 358 + u64 min_seq = (u64)-1; 359 + u64 seq_putting = elem->seq; 360 + 361 + if (!seq_putting) 362 + return; 363 + 364 + BUG_ON(!(elem->flags & 1)); 365 + spin_lock(&fs_info->tree_mod_seq_lock); 366 + list_del(&elem->list); 367 + 368 + list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) { 369 + if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) { 370 + if (seq_putting > cur_elem->seq) { 371 + /* 372 + * blocker with lower sequence number exists, we 373 + * cannot remove anything from the log 374 + */ 375 + goto out; 376 + } 377 + min_seq = cur_elem->seq; 378 + } 379 + } 380 + 381 + /* 382 + * anything that's lower than the lowest existing (read: blocked) 383 + * sequence number can be removed from the tree. 
384 + */ 385 + write_lock(&fs_info->tree_mod_log_lock); 386 + tm_root = &fs_info->tree_mod_log; 387 + for (node = rb_first(tm_root); node; node = next) { 388 + next = rb_next(node); 389 + tm = container_of(node, struct tree_mod_elem, node); 390 + if (tm->elem.seq > min_seq) 391 + continue; 392 + rb_erase(node, tm_root); 393 + list_del(&tm->elem.list); 394 + kfree(tm); 395 + } 396 + write_unlock(&fs_info->tree_mod_log_lock); 397 + out: 398 + spin_unlock(&fs_info->tree_mod_seq_lock); 399 + } 400 + 401 + /* 402 + * key order of the log: 403 + * index -> sequence 404 + * 405 + * the index is the shifted logical of the *new* root node for root replace 406 + * operations, or the shifted logical of the affected block for all other 407 + * operations. 408 + */ 409 + static noinline int 410 + __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) 411 + { 412 + struct rb_root *tm_root; 413 + struct rb_node **new; 414 + struct rb_node *parent = NULL; 415 + struct tree_mod_elem *cur; 416 + int ret = 0; 417 + 418 + BUG_ON(!tm || !tm->elem.seq); 419 + 420 + write_lock(&fs_info->tree_mod_log_lock); 421 + tm_root = &fs_info->tree_mod_log; 422 + new = &tm_root->rb_node; 423 + while (*new) { 424 + cur = container_of(*new, struct tree_mod_elem, node); 425 + parent = *new; 426 + if (cur->index < tm->index) 427 + new = &((*new)->rb_left); 428 + else if (cur->index > tm->index) 429 + new = &((*new)->rb_right); 430 + else if (cur->elem.seq < tm->elem.seq) 431 + new = &((*new)->rb_left); 432 + else if (cur->elem.seq > tm->elem.seq) 433 + new = &((*new)->rb_right); 434 + else { 435 + kfree(tm); 436 + ret = -EEXIST; 437 + goto unlock; 438 + } 439 + } 440 + 441 + rb_link_node(&tm->node, parent, new); 442 + rb_insert_color(&tm->node, tm_root); 443 + unlock: 444 + write_unlock(&fs_info->tree_mod_log_lock); 445 + return ret; 446 + } 447 + 448 + static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, 449 + struct extent_buffer *eb) { 450 + smp_mb(); 451 + if 
(list_empty(&(fs_info)->tree_mod_seq_list)) 452 + return 1; 453 + if (!eb) 454 + return 0; 455 + if (btrfs_header_level(eb) == 0) 456 + return 1; 457 + return 0; 458 + } 459 + 460 + static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, 461 + struct tree_mod_elem **tm_ret) 462 + { 463 + struct tree_mod_elem *tm; 464 + int seq; 465 + 466 + if (tree_mod_dont_log(fs_info, NULL)) 467 + return 0; 468 + 469 + tm = *tm_ret = kzalloc(sizeof(*tm), flags); 470 + if (!tm) 471 + return -ENOMEM; 472 + 473 + tm->elem.flags = 0; 474 + spin_lock(&fs_info->tree_mod_seq_lock); 475 + if (list_empty(&fs_info->tree_mod_seq_list)) { 476 + /* 477 + * someone emptied the list while we were waiting for the lock. 478 + * we must not add to the list, because no blocker exists. items 479 + * are removed from the list only when the existing blocker is 480 + * removed from the list. 481 + */ 482 + kfree(tm); 483 + seq = 0; 484 + } else { 485 + __get_tree_mod_seq(fs_info, &tm->elem); 486 + seq = tm->elem.seq; 487 + } 488 + spin_unlock(&fs_info->tree_mod_seq_lock); 489 + 490 + return seq; 491 + } 492 + 493 + static noinline int 494 + tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, 495 + struct extent_buffer *eb, int slot, 496 + enum mod_log_op op, gfp_t flags) 497 + { 498 + struct tree_mod_elem *tm; 499 + int ret; 500 + 501 + ret = tree_mod_alloc(fs_info, flags, &tm); 502 + if (ret <= 0) 503 + return ret; 504 + 505 + tm->index = eb->start >> PAGE_CACHE_SHIFT; 506 + if (op != MOD_LOG_KEY_ADD) { 507 + btrfs_node_key(eb, &tm->key, slot); 508 + tm->blockptr = btrfs_node_blockptr(eb, slot); 509 + } 510 + tm->op = op; 511 + tm->slot = slot; 512 + tm->generation = btrfs_node_ptr_generation(eb, slot); 513 + 514 + return __tree_mod_log_insert(fs_info, tm); 515 + } 516 + 517 + static noinline int 518 + tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, 519 + int slot, enum mod_log_op op) 520 + { 521 + return 
tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); 522 + } 523 + 524 + static noinline int 525 + tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, 526 + struct extent_buffer *eb, int dst_slot, int src_slot, 527 + int nr_items, gfp_t flags) 528 + { 529 + struct tree_mod_elem *tm; 530 + int ret; 531 + int i; 532 + 533 + if (tree_mod_dont_log(fs_info, eb)) 534 + return 0; 535 + 536 + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { 537 + ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot, 538 + MOD_LOG_KEY_REMOVE_WHILE_MOVING); 539 + BUG_ON(ret < 0); 540 + } 541 + 542 + ret = tree_mod_alloc(fs_info, flags, &tm); 543 + if (ret <= 0) 544 + return ret; 545 + 546 + tm->index = eb->start >> PAGE_CACHE_SHIFT; 547 + tm->slot = src_slot; 548 + tm->move.dst_slot = dst_slot; 549 + tm->move.nr_items = nr_items; 550 + tm->op = MOD_LOG_MOVE_KEYS; 551 + 552 + return __tree_mod_log_insert(fs_info, tm); 553 + } 554 + 555 + static noinline int 556 + tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, 557 + struct extent_buffer *old_root, 558 + struct extent_buffer *new_root, gfp_t flags) 559 + { 560 + struct tree_mod_elem *tm; 561 + int ret; 562 + 563 + ret = tree_mod_alloc(fs_info, flags, &tm); 564 + if (ret <= 0) 565 + return ret; 566 + 567 + tm->index = new_root->start >> PAGE_CACHE_SHIFT; 568 + tm->old_root.logical = old_root->start; 569 + tm->old_root.level = btrfs_header_level(old_root); 570 + tm->generation = btrfs_header_generation(old_root); 571 + tm->op = MOD_LOG_ROOT_REPLACE; 572 + 573 + return __tree_mod_log_insert(fs_info, tm); 574 + } 575 + 576 + static struct tree_mod_elem * 577 + __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, 578 + int smallest) 579 + { 580 + struct rb_root *tm_root; 581 + struct rb_node *node; 582 + struct tree_mod_elem *cur = NULL; 583 + struct tree_mod_elem *found = NULL; 584 + u64 index = start >> PAGE_CACHE_SHIFT; 585 + 586 + read_lock(&fs_info->tree_mod_log_lock); 587 + tm_root = 
&fs_info->tree_mod_log; 588 + node = tm_root->rb_node; 589 + while (node) { 590 + cur = container_of(node, struct tree_mod_elem, node); 591 + if (cur->index < index) { 592 + node = node->rb_left; 593 + } else if (cur->index > index) { 594 + node = node->rb_right; 595 + } else if (cur->elem.seq < min_seq) { 596 + node = node->rb_left; 597 + } else if (!smallest) { 598 + /* we want the node with the highest seq */ 599 + if (found) 600 + BUG_ON(found->elem.seq > cur->elem.seq); 601 + found = cur; 602 + node = node->rb_left; 603 + } else if (cur->elem.seq > min_seq) { 604 + /* we want the node with the smallest seq */ 605 + if (found) 606 + BUG_ON(found->elem.seq < cur->elem.seq); 607 + found = cur; 608 + node = node->rb_right; 609 + } else { 610 + found = cur; 611 + break; 612 + } 613 + } 614 + read_unlock(&fs_info->tree_mod_log_lock); 615 + 616 + return found; 617 + } 618 + 619 + /* 620 + * this returns the element from the log with the smallest time sequence 621 + * value that's in the log (the oldest log item). any element with a time 622 + * sequence lower than min_seq will be ignored. 623 + */ 624 + static struct tree_mod_elem * 625 + tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, 626 + u64 min_seq) 627 + { 628 + return __tree_mod_log_search(fs_info, start, min_seq, 1); 629 + } 630 + 631 + /* 632 + * this returns the element from the log with the largest time sequence 633 + * value that's in the log (the most recent log item). any element with 634 + * a time sequence lower than min_seq will be ignored. 
635 + */ 636 + static struct tree_mod_elem * 637 + tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) 638 + { 639 + return __tree_mod_log_search(fs_info, start, min_seq, 0); 640 + } 641 + 642 + static inline void 643 + tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, 644 + struct extent_buffer *src, unsigned long dst_offset, 645 + unsigned long src_offset, int nr_items) 646 + { 647 + int ret; 648 + int i; 649 + 650 + if (tree_mod_dont_log(fs_info, NULL)) 651 + return; 652 + 653 + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) 654 + return; 655 + 656 + /* speed this up by single seq for all operations? */ 657 + for (i = 0; i < nr_items; i++) { 658 + ret = tree_mod_log_insert_key(fs_info, src, i + src_offset, 659 + MOD_LOG_KEY_REMOVE); 660 + BUG_ON(ret < 0); 661 + ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset, 662 + MOD_LOG_KEY_ADD); 663 + BUG_ON(ret < 0); 664 + } 665 + } 666 + 667 + static inline void 668 + tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, 669 + int dst_offset, int src_offset, int nr_items) 670 + { 671 + int ret; 672 + ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, 673 + nr_items, GFP_NOFS); 674 + BUG_ON(ret < 0); 675 + } 676 + 677 + static inline void 678 + tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, 679 + struct extent_buffer *eb, 680 + struct btrfs_disk_key *disk_key, int slot, int atomic) 681 + { 682 + int ret; 683 + 684 + ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, 685 + MOD_LOG_KEY_REPLACE, 686 + atomic ? 
GFP_ATOMIC : GFP_NOFS); 687 + BUG_ON(ret < 0); 688 + } 689 + 690 + static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 691 + struct extent_buffer *eb) 692 + { 693 + int i; 694 + int ret; 695 + u32 nritems; 696 + 697 + if (tree_mod_dont_log(fs_info, eb)) 698 + return; 699 + 700 + nritems = btrfs_header_nritems(eb); 701 + for (i = nritems - 1; i >= 0; i--) { 702 + ret = tree_mod_log_insert_key(fs_info, eb, i, 703 + MOD_LOG_KEY_REMOVE_WHILE_FREEING); 704 + BUG_ON(ret < 0); 705 + } 706 + } 707 + 708 + static inline void 709 + tree_mod_log_set_root_pointer(struct btrfs_root *root, 710 + struct extent_buffer *new_root_node) 711 + { 712 + int ret; 713 + tree_mod_log_free_eb(root->fs_info, root->node); 714 + ret = tree_mod_log_insert_root(root->fs_info, root->node, 715 + new_root_node, GFP_NOFS); 716 + BUG_ON(ret < 0); 299 717 } 300 718 301 719 /* ··· 847 409 ret = btrfs_dec_ref(trans, root, buf, 1, 1); 848 410 BUG_ON(ret); /* -ENOMEM */ 849 411 } 412 + /* 413 + * don't log freeing in case we're freeing the root node, this 414 + * is done by tree_mod_log_set_root_pointer later 415 + */ 416 + if (buf != root->node && btrfs_header_level(buf) != 0) 417 + tree_mod_log_free_eb(root->fs_info, buf); 850 418 clean_tree_block(trans, root, buf); 851 419 *last_ref = 1; 852 420 } ··· 911 467 912 468 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 913 469 root->root_key.objectid, &disk_key, 914 - level, search_start, empty_size, 1); 470 + level, search_start, empty_size); 915 471 if (IS_ERR(cow)) 916 472 return PTR_ERR(cow); 917 473 ··· 950 506 parent_start = 0; 951 507 952 508 extent_buffer_get(cow); 509 + tree_mod_log_set_root_pointer(root, cow); 953 510 rcu_assign_pointer(root->node, cow); 954 511 955 512 btrfs_free_tree_block(trans, root, buf, parent_start, 956 - last_ref, 1); 513 + last_ref); 957 514 free_extent_buffer(buf); 958 515 add_root_to_dirty_list(root); 959 516 } else { ··· 964 519 parent_start = 0; 965 520 966 521 WARN_ON(trans->transid != 
btrfs_header_generation(parent)); 522 + tree_mod_log_insert_key(root->fs_info, parent, parent_slot, 523 + MOD_LOG_KEY_REPLACE); 967 524 btrfs_set_node_blockptr(parent, parent_slot, 968 525 cow->start); 969 526 btrfs_set_node_ptr_generation(parent, parent_slot, 970 527 trans->transid); 971 528 btrfs_mark_buffer_dirty(parent); 972 529 btrfs_free_tree_block(trans, root, buf, parent_start, 973 - last_ref, 1); 530 + last_ref); 974 531 } 975 532 if (unlock_orig) 976 533 btrfs_tree_unlock(buf); ··· 980 533 btrfs_mark_buffer_dirty(cow); 981 534 *cow_ret = cow; 982 535 return 0; 536 + } 537 + 538 + /* 539 + * returns the logical address of the oldest predecessor of the given root. 540 + * entries older than time_seq are ignored. 541 + */ 542 + static struct tree_mod_elem * 543 + __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, 544 + struct btrfs_root *root, u64 time_seq) 545 + { 546 + struct tree_mod_elem *tm; 547 + struct tree_mod_elem *found = NULL; 548 + u64 root_logical = root->node->start; 549 + int looped = 0; 550 + 551 + if (!time_seq) 552 + return 0; 553 + 554 + /* 555 + * the very last operation that's logged for a root is the replacement 556 + * operation (if it is replaced at all). this has the index of the *new* 557 + * root, making it the very first operation that's logged for this root. 558 + */ 559 + while (1) { 560 + tm = tree_mod_log_search_oldest(fs_info, root_logical, 561 + time_seq); 562 + if (!looped && !tm) 563 + return 0; 564 + /* 565 + * we must have key remove operations in the log before the 566 + * replace operation. 567 + */ 568 + BUG_ON(!tm); 569 + 570 + if (tm->op != MOD_LOG_ROOT_REPLACE) 571 + break; 572 + 573 + found = tm; 574 + root_logical = tm->old_root.logical; 575 + BUG_ON(root_logical == root->node->start); 576 + looped = 1; 577 + } 578 + 579 + return found; 580 + } 581 + 582 + /* 583 + * tm is a pointer to the first operation to rewind within eb. 
then, all 584 + * previous operations will be rewinded (until we reach something older than 585 + * time_seq). 586 + */ 587 + static void 588 + __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, 589 + struct tree_mod_elem *first_tm) 590 + { 591 + u32 n; 592 + struct rb_node *next; 593 + struct tree_mod_elem *tm = first_tm; 594 + unsigned long o_dst; 595 + unsigned long o_src; 596 + unsigned long p_size = sizeof(struct btrfs_key_ptr); 597 + 598 + n = btrfs_header_nritems(eb); 599 + while (tm && tm->elem.seq >= time_seq) { 600 + /* 601 + * all the operations are recorded with the operator used for 602 + * the modification. as we're going backwards, we do the 603 + * opposite of each operation here. 604 + */ 605 + switch (tm->op) { 606 + case MOD_LOG_KEY_REMOVE_WHILE_FREEING: 607 + BUG_ON(tm->slot < n); 608 + case MOD_LOG_KEY_REMOVE_WHILE_MOVING: 609 + case MOD_LOG_KEY_REMOVE: 610 + btrfs_set_node_key(eb, &tm->key, tm->slot); 611 + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 612 + btrfs_set_node_ptr_generation(eb, tm->slot, 613 + tm->generation); 614 + n++; 615 + break; 616 + case MOD_LOG_KEY_REPLACE: 617 + BUG_ON(tm->slot >= n); 618 + btrfs_set_node_key(eb, &tm->key, tm->slot); 619 + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 620 + btrfs_set_node_ptr_generation(eb, tm->slot, 621 + tm->generation); 622 + break; 623 + case MOD_LOG_KEY_ADD: 624 + if (tm->slot != n - 1) { 625 + o_dst = btrfs_node_key_ptr_offset(tm->slot); 626 + o_src = btrfs_node_key_ptr_offset(tm->slot + 1); 627 + memmove_extent_buffer(eb, o_dst, o_src, p_size); 628 + } 629 + n--; 630 + break; 631 + case MOD_LOG_MOVE_KEYS: 632 + o_dst = btrfs_node_key_ptr_offset(tm->slot); 633 + o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); 634 + memmove_extent_buffer(eb, o_dst, o_src, 635 + tm->move.nr_items * p_size); 636 + break; 637 + case MOD_LOG_ROOT_REPLACE: 638 + /* 639 + * this operation is special. for roots, this must be 640 + * handled explicitly before rewinding. 
641 + * for non-roots, this operation may exist if the node 642 + * was a root: root A -> child B; then A gets empty and 643 + * B is promoted to the new root. in the mod log, we'll 644 + * have a root-replace operation for B, a tree block 645 + * that is no root. we simply ignore that operation. 646 + */ 647 + break; 648 + } 649 + next = rb_next(&tm->node); 650 + if (!next) 651 + break; 652 + tm = container_of(next, struct tree_mod_elem, node); 653 + if (tm->index != first_tm->index) 654 + break; 655 + } 656 + btrfs_set_header_nritems(eb, n); 657 + } 658 + 659 + static struct extent_buffer * 660 + tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, 661 + u64 time_seq) 662 + { 663 + struct extent_buffer *eb_rewin; 664 + struct tree_mod_elem *tm; 665 + 666 + if (!time_seq) 667 + return eb; 668 + 669 + if (btrfs_header_level(eb) == 0) 670 + return eb; 671 + 672 + tm = tree_mod_log_search(fs_info, eb->start, time_seq); 673 + if (!tm) 674 + return eb; 675 + 676 + if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { 677 + BUG_ON(tm->slot != 0); 678 + eb_rewin = alloc_dummy_extent_buffer(eb->start, 679 + fs_info->tree_root->nodesize); 680 + BUG_ON(!eb_rewin); 681 + btrfs_set_header_bytenr(eb_rewin, eb->start); 682 + btrfs_set_header_backref_rev(eb_rewin, 683 + btrfs_header_backref_rev(eb)); 684 + btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); 685 + btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); 686 + } else { 687 + eb_rewin = btrfs_clone_extent_buffer(eb); 688 + BUG_ON(!eb_rewin); 689 + } 690 + 691 + extent_buffer_get(eb_rewin); 692 + free_extent_buffer(eb); 693 + 694 + __tree_mod_log_rewind(eb_rewin, time_seq, tm); 695 + 696 + return eb_rewin; 697 + } 698 + 699 + static inline struct extent_buffer * 700 + get_old_root(struct btrfs_root *root, u64 time_seq) 701 + { 702 + struct tree_mod_elem *tm; 703 + struct extent_buffer *eb; 704 + struct tree_mod_root *old_root; 705 + u64 old_generation; 706 + 707 + tm = 
__tree_mod_log_oldest_root(root->fs_info, root, time_seq); 708 + if (!tm) 709 + return root->node; 710 + 711 + old_root = &tm->old_root; 712 + old_generation = tm->generation; 713 + 714 + tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq); 715 + /* 716 + * there was an item in the log when __tree_mod_log_oldest_root 717 + * returned. this one must not go away, because the time_seq passed to 718 + * us must be blocking its removal. 719 + */ 720 + BUG_ON(!tm); 721 + 722 + if (old_root->logical == root->node->start) { 723 + /* there are logged operations for the current root */ 724 + eb = btrfs_clone_extent_buffer(root->node); 725 + } else { 726 + /* there's a root replace operation for the current root */ 727 + eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, 728 + root->nodesize); 729 + btrfs_set_header_bytenr(eb, eb->start); 730 + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); 731 + btrfs_set_header_owner(eb, root->root_key.objectid); 732 + } 733 + if (!eb) 734 + return NULL; 735 + btrfs_set_header_level(eb, old_root->level); 736 + btrfs_set_header_generation(eb, old_generation); 737 + __tree_mod_log_rewind(eb, time_seq, tm); 738 + 739 + return eb; 983 740 } 984 741 985 742 static inline int should_cow_block(struct btrfs_trans_handle *trans, ··· 1390 739 if (!cur) 1391 740 return -EIO; 1392 741 } else if (!uptodate) { 1393 - btrfs_read_buffer(cur, gen); 742 + err = btrfs_read_buffer(cur, gen); 743 + if (err) { 744 + free_extent_buffer(cur); 745 + return err; 746 + } 1394 747 } 1395 748 } 1396 749 if (search_start == 0) ··· 1509 854 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, 1510 855 int level, int *slot) 1511 856 { 1512 - if (level == 0) { 857 + if (level == 0) 1513 858 return generic_bin_search(eb, 1514 859 offsetof(struct btrfs_leaf, items), 1515 860 sizeof(struct btrfs_item), 1516 861 key, btrfs_header_nritems(eb), 1517 862 slot); 1518 - } else { 863 + else 1519 864 return 
generic_bin_search(eb, 1520 865 offsetof(struct btrfs_node, ptrs), 1521 866 sizeof(struct btrfs_key_ptr), 1522 867 key, btrfs_header_nritems(eb), 1523 868 slot); 1524 - } 1525 - return -1; 1526 869 } 1527 870 1528 871 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, ··· 1627 974 goto enospc; 1628 975 } 1629 976 977 + tree_mod_log_set_root_pointer(root, child); 1630 978 rcu_assign_pointer(root->node, child); 1631 979 1632 980 add_root_to_dirty_list(root); ··· 1641 987 free_extent_buffer(mid); 1642 988 1643 989 root_sub_used(root, mid->len); 1644 - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); 990 + btrfs_free_tree_block(trans, root, mid, 0, 1); 1645 991 /* once for the root ptr */ 1646 992 free_extent_buffer_stale(mid); 1647 993 return 0; ··· 1694 1040 if (btrfs_header_nritems(right) == 0) { 1695 1041 clean_tree_block(trans, root, right); 1696 1042 btrfs_tree_unlock(right); 1697 - del_ptr(trans, root, path, level + 1, pslot + 1); 1043 + del_ptr(trans, root, path, level + 1, pslot + 1, 1); 1698 1044 root_sub_used(root, right->len); 1699 - btrfs_free_tree_block(trans, root, right, 0, 1, 0); 1045 + btrfs_free_tree_block(trans, root, right, 0, 1); 1700 1046 free_extent_buffer_stale(right); 1701 1047 right = NULL; 1702 1048 } else { 1703 1049 struct btrfs_disk_key right_key; 1704 1050 btrfs_node_key(right, &right_key, 0); 1051 + tree_mod_log_set_node_key(root->fs_info, parent, 1052 + &right_key, pslot + 1, 0); 1705 1053 btrfs_set_node_key(parent, &right_key, pslot + 1); 1706 1054 btrfs_mark_buffer_dirty(parent); 1707 1055 } ··· 1738 1082 if (btrfs_header_nritems(mid) == 0) { 1739 1083 clean_tree_block(trans, root, mid); 1740 1084 btrfs_tree_unlock(mid); 1741 - del_ptr(trans, root, path, level + 1, pslot); 1085 + del_ptr(trans, root, path, level + 1, pslot, 1); 1742 1086 root_sub_used(root, mid->len); 1743 - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); 1087 + btrfs_free_tree_block(trans, root, mid, 0, 1); 1744 1088 
free_extent_buffer_stale(mid); 1745 1089 mid = NULL; 1746 1090 } else { 1747 1091 /* update the parent key to reflect our changes */ 1748 1092 struct btrfs_disk_key mid_key; 1749 1093 btrfs_node_key(mid, &mid_key, 0); 1094 + tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, 1095 + pslot, 0); 1750 1096 btrfs_set_node_key(parent, &mid_key, pslot); 1751 1097 btrfs_mark_buffer_dirty(parent); 1752 1098 } ··· 1846 1188 struct btrfs_disk_key disk_key; 1847 1189 orig_slot += left_nr; 1848 1190 btrfs_node_key(mid, &disk_key, 0); 1191 + tree_mod_log_set_node_key(root->fs_info, parent, 1192 + &disk_key, pslot, 0); 1849 1193 btrfs_set_node_key(parent, &disk_key, pslot); 1850 1194 btrfs_mark_buffer_dirty(parent); 1851 1195 if (btrfs_header_nritems(left) > orig_slot) { ··· 1899 1239 struct btrfs_disk_key disk_key; 1900 1240 1901 1241 btrfs_node_key(right, &disk_key, 0); 1242 + tree_mod_log_set_node_key(root->fs_info, parent, 1243 + &disk_key, pslot + 1, 0); 1902 1244 btrfs_set_node_key(parent, &disk_key, pslot + 1); 1903 1245 btrfs_mark_buffer_dirty(parent); 1904 1246 ··· 2158 1496 read_block_for_search(struct btrfs_trans_handle *trans, 2159 1497 struct btrfs_root *root, struct btrfs_path *p, 2160 1498 struct extent_buffer **eb_ret, int level, int slot, 2161 - struct btrfs_key *key) 1499 + struct btrfs_key *key, u64 time_seq) 2162 1500 { 2163 1501 u64 blocknr; 2164 1502 u64 gen; ··· 2512 1850 } 2513 1851 2514 1852 err = read_block_for_search(trans, root, p, 2515 - &b, level, slot, key); 1853 + &b, level, slot, key, 0); 2516 1854 if (err == -EAGAIN) 2517 1855 goto again; 2518 1856 if (err) { ··· 2584 1922 } 2585 1923 2586 1924 /* 1925 + * Like btrfs_search_slot, this looks for a key in the given tree. It uses the 1926 + * current state of the tree together with the operations recorded in the tree 1927 + * modification log to search for the key in a previous version of this tree, as 1928 + * denoted by the time_seq parameter. 
1929 + * 1930 + * Naturally, there is no support for insert, delete or cow operations. 1931 + * 1932 + * The resulting path and return value will be set up as if we called 1933 + * btrfs_search_slot at that point in time with ins_len and cow both set to 0. 1934 + */ 1935 + int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, 1936 + struct btrfs_path *p, u64 time_seq) 1937 + { 1938 + struct extent_buffer *b; 1939 + int slot; 1940 + int ret; 1941 + int err; 1942 + int level; 1943 + int lowest_unlock = 1; 1944 + u8 lowest_level = 0; 1945 + 1946 + lowest_level = p->lowest_level; 1947 + WARN_ON(p->nodes[0] != NULL); 1948 + 1949 + if (p->search_commit_root) { 1950 + BUG_ON(time_seq); 1951 + return btrfs_search_slot(NULL, root, key, p, 0, 0); 1952 + } 1953 + 1954 + again: 1955 + b = get_old_root(root, time_seq); 1956 + extent_buffer_get(b); 1957 + level = btrfs_header_level(b); 1958 + btrfs_tree_read_lock(b); 1959 + p->locks[level] = BTRFS_READ_LOCK; 1960 + 1961 + while (b) { 1962 + level = btrfs_header_level(b); 1963 + p->nodes[level] = b; 1964 + btrfs_clear_path_blocking(p, NULL, 0); 1965 + 1966 + /* 1967 + * we have a lock on b and as long as we aren't changing 1968 + * the tree, there is no way to for the items in b to change. 1969 + * It is safe to drop the lock on our parent before we 1970 + * go through the expensive btree search on b. 
1971 + */ 1972 + btrfs_unlock_up_safe(p, level + 1); 1973 + 1974 + ret = bin_search(b, key, level, &slot); 1975 + 1976 + if (level != 0) { 1977 + int dec = 0; 1978 + if (ret && slot > 0) { 1979 + dec = 1; 1980 + slot -= 1; 1981 + } 1982 + p->slots[level] = slot; 1983 + unlock_up(p, level, lowest_unlock, 0, NULL); 1984 + 1985 + if (level == lowest_level) { 1986 + if (dec) 1987 + p->slots[level]++; 1988 + goto done; 1989 + } 1990 + 1991 + err = read_block_for_search(NULL, root, p, &b, level, 1992 + slot, key, time_seq); 1993 + if (err == -EAGAIN) 1994 + goto again; 1995 + if (err) { 1996 + ret = err; 1997 + goto done; 1998 + } 1999 + 2000 + level = btrfs_header_level(b); 2001 + err = btrfs_try_tree_read_lock(b); 2002 + if (!err) { 2003 + btrfs_set_path_blocking(p); 2004 + btrfs_tree_read_lock(b); 2005 + btrfs_clear_path_blocking(p, b, 2006 + BTRFS_READ_LOCK); 2007 + } 2008 + p->locks[level] = BTRFS_READ_LOCK; 2009 + p->nodes[level] = b; 2010 + b = tree_mod_log_rewind(root->fs_info, b, time_seq); 2011 + if (b != p->nodes[level]) { 2012 + btrfs_tree_unlock_rw(p->nodes[level], 2013 + p->locks[level]); 2014 + p->locks[level] = 0; 2015 + p->nodes[level] = b; 2016 + } 2017 + } else { 2018 + p->slots[level] = slot; 2019 + unlock_up(p, level, lowest_unlock, 0, NULL); 2020 + goto done; 2021 + } 2022 + } 2023 + ret = 1; 2024 + done: 2025 + if (!p->leave_spinning) 2026 + btrfs_set_path_blocking(p); 2027 + if (ret < 0) 2028 + btrfs_release_path(p); 2029 + 2030 + return ret; 2031 + } 2032 + 2033 + /* 2587 2034 * adjust the pointers going up the tree, starting at level 2588 2035 * making sure the right key of each node is points to 'key'. 
2589 2036 * This is used after shifting pointers to the left, so it stops ··· 2712 1941 if (!path->nodes[i]) 2713 1942 break; 2714 1943 t = path->nodes[i]; 1944 + tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); 2715 1945 btrfs_set_node_key(t, key, tslot); 2716 1946 btrfs_mark_buffer_dirty(path->nodes[i]); 2717 1947 if (tslot != 0) ··· 2795 2023 } else 2796 2024 push_items = min(src_nritems - 8, push_items); 2797 2025 2026 + tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, 2027 + push_items); 2798 2028 copy_extent_buffer(dst, src, 2799 2029 btrfs_node_key_ptr_offset(dst_nritems), 2800 2030 btrfs_node_key_ptr_offset(0), 2801 2031 push_items * sizeof(struct btrfs_key_ptr)); 2802 2032 2803 2033 if (push_items < src_nritems) { 2034 + tree_mod_log_eb_move(root->fs_info, src, 0, push_items, 2035 + src_nritems - push_items); 2804 2036 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), 2805 2037 btrfs_node_key_ptr_offset(push_items), 2806 2038 (src_nritems - push_items) * ··· 2858 2082 if (max_push < push_items) 2859 2083 push_items = max_push; 2860 2084 2085 + tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); 2861 2086 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), 2862 2087 btrfs_node_key_ptr_offset(0), 2863 2088 (dst_nritems) * 2864 2089 sizeof(struct btrfs_key_ptr)); 2865 2090 2091 + tree_mod_log_eb_copy(root->fs_info, dst, src, 0, 2092 + src_nritems - push_items, push_items); 2866 2093 copy_extent_buffer(dst, src, 2867 2094 btrfs_node_key_ptr_offset(0), 2868 2095 btrfs_node_key_ptr_offset(src_nritems - push_items), ··· 2908 2129 2909 2130 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2910 2131 root->root_key.objectid, &lower_key, 2911 - level, root->node->start, 0, 0); 2132 + level, root->node->start, 0); 2912 2133 if (IS_ERR(c)) 2913 2134 return PTR_ERR(c); 2914 2135 ··· 2940 2161 btrfs_mark_buffer_dirty(c); 2941 2162 2942 2163 old = root->node; 2164 + 
tree_mod_log_set_root_pointer(root, c); 2943 2165 rcu_assign_pointer(root->node, c); 2944 2166 2945 2167 /* the super has an extra ref to root->node */ ··· 2964 2184 static void insert_ptr(struct btrfs_trans_handle *trans, 2965 2185 struct btrfs_root *root, struct btrfs_path *path, 2966 2186 struct btrfs_disk_key *key, u64 bytenr, 2967 - int slot, int level) 2187 + int slot, int level, int tree_mod_log) 2968 2188 { 2969 2189 struct extent_buffer *lower; 2970 2190 int nritems; 2191 + int ret; 2971 2192 2972 2193 BUG_ON(!path->nodes[level]); 2973 2194 btrfs_assert_tree_locked(path->nodes[level]); ··· 2977 2196 BUG_ON(slot > nritems); 2978 2197 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); 2979 2198 if (slot != nritems) { 2199 + if (tree_mod_log && level) 2200 + tree_mod_log_eb_move(root->fs_info, lower, slot + 1, 2201 + slot, nritems - slot); 2980 2202 memmove_extent_buffer(lower, 2981 2203 btrfs_node_key_ptr_offset(slot + 1), 2982 2204 btrfs_node_key_ptr_offset(slot), 2983 2205 (nritems - slot) * sizeof(struct btrfs_key_ptr)); 2206 + } 2207 + if (tree_mod_log && level) { 2208 + ret = tree_mod_log_insert_key(root->fs_info, lower, slot, 2209 + MOD_LOG_KEY_ADD); 2210 + BUG_ON(ret < 0); 2984 2211 } 2985 2212 btrfs_set_node_key(lower, key, slot); 2986 2213 btrfs_set_node_blockptr(lower, slot, bytenr); ··· 3041 2252 3042 2253 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3043 2254 root->root_key.objectid, 3044 - &disk_key, level, c->start, 0, 0); 2255 + &disk_key, level, c->start, 0); 3045 2256 if (IS_ERR(split)) 3046 2257 return PTR_ERR(split); 3047 2258 ··· 3060 2271 (unsigned long)btrfs_header_chunk_tree_uuid(split), 3061 2272 BTRFS_UUID_SIZE); 3062 2273 3063 - 2274 + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); 3064 2275 copy_extent_buffer(split, c, 3065 2276 btrfs_node_key_ptr_offset(0), 3066 2277 btrfs_node_key_ptr_offset(mid), ··· 3073 2284 btrfs_mark_buffer_dirty(split); 3074 2285 3075 2286 insert_ptr(trans, root, 
path, &disk_key, split->start, 3076 - path->slots[level + 1] + 1, level + 1); 2287 + path->slots[level + 1] + 1, level + 1, 1); 3077 2288 3078 2289 if (path->slots[level] >= mid) { 3079 2290 path->slots[level] -= mid; ··· 3610 2821 btrfs_set_header_nritems(l, mid); 3611 2822 btrfs_item_key(right, &disk_key, 0); 3612 2823 insert_ptr(trans, root, path, &disk_key, right->start, 3613 - path->slots[1] + 1, 1); 2824 + path->slots[1] + 1, 1, 0); 3614 2825 3615 2826 btrfs_mark_buffer_dirty(right); 3616 2827 btrfs_mark_buffer_dirty(l); ··· 3793 3004 3794 3005 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 3795 3006 root->root_key.objectid, 3796 - &disk_key, 0, l->start, 0, 0); 3007 + &disk_key, 0, l->start, 0); 3797 3008 if (IS_ERR(right)) 3798 3009 return PTR_ERR(right); 3799 3010 ··· 3817 3028 if (mid <= slot) { 3818 3029 btrfs_set_header_nritems(right, 0); 3819 3030 insert_ptr(trans, root, path, &disk_key, right->start, 3820 - path->slots[1] + 1, 1); 3031 + path->slots[1] + 1, 1, 0); 3821 3032 btrfs_tree_unlock(path->nodes[0]); 3822 3033 free_extent_buffer(path->nodes[0]); 3823 3034 path->nodes[0] = right; ··· 3826 3037 } else { 3827 3038 btrfs_set_header_nritems(right, 0); 3828 3039 insert_ptr(trans, root, path, &disk_key, right->start, 3829 - path->slots[1], 1); 3040 + path->slots[1], 1, 0); 3830 3041 btrfs_tree_unlock(path->nodes[0]); 3831 3042 free_extent_buffer(path->nodes[0]); 3832 3043 path->nodes[0] = right; ··· 4538 3749 * empty a node. 
4539 3750 */ 4540 3751 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4541 - struct btrfs_path *path, int level, int slot) 3752 + struct btrfs_path *path, int level, int slot, 3753 + int tree_mod_log) 4542 3754 { 4543 3755 struct extent_buffer *parent = path->nodes[level]; 4544 3756 u32 nritems; 3757 + int ret; 4545 3758 4546 3759 nritems = btrfs_header_nritems(parent); 4547 3760 if (slot != nritems - 1) { 3761 + if (tree_mod_log && level) 3762 + tree_mod_log_eb_move(root->fs_info, parent, slot, 3763 + slot + 1, nritems - slot - 1); 4548 3764 memmove_extent_buffer(parent, 4549 3765 btrfs_node_key_ptr_offset(slot), 4550 3766 btrfs_node_key_ptr_offset(slot + 1), 4551 3767 sizeof(struct btrfs_key_ptr) * 4552 3768 (nritems - slot - 1)); 3769 + } else if (tree_mod_log && level) { 3770 + ret = tree_mod_log_insert_key(root->fs_info, parent, slot, 3771 + MOD_LOG_KEY_REMOVE); 3772 + BUG_ON(ret < 0); 4553 3773 } 3774 + 4554 3775 nritems--; 4555 3776 btrfs_set_header_nritems(parent, nritems); 4556 3777 if (nritems == 0 && parent == root->node) { ··· 4592 3793 struct extent_buffer *leaf) 4593 3794 { 4594 3795 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4595 - del_ptr(trans, root, path, 1, path->slots[1]); 3796 + del_ptr(trans, root, path, 1, path->slots[1], 1); 4596 3797 4597 3798 /* 4598 3799 * btrfs_free_extent is expensive, we want to make sure we ··· 4603 3804 root_sub_used(root, leaf->len); 4604 3805 4605 3806 extent_buffer_get(leaf); 4606 - btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); 3807 + btrfs_free_tree_block(trans, root, leaf, 0, 1); 4607 3808 free_extent_buffer_stale(leaf); 4608 3809 } 4609 3810 /* ··· 5070 4271 next = c; 5071 4272 next_rw_lock = path->locks[level]; 5072 4273 ret = read_block_for_search(NULL, root, path, &next, level, 5073 - slot, &key); 4274 + slot, &key, 0); 5074 4275 if (ret == -EAGAIN) 5075 4276 goto again; 5076 4277 ··· 5107 4308 break; 5108 4309 5109 4310 ret = read_block_for_search(NULL, 
root, path, &next, level, 5110 - 0, &key); 4311 + 0, &key, 0); 5111 4312 if (ret == -EAGAIN) 5112 4313 goto again; 5113 4314
+74 -3
fs/btrfs/ctree.h
··· 173 173 #define BTRFS_FT_XATTR 8 174 174 #define BTRFS_FT_MAX 9 175 175 176 + /* ioprio of readahead is set to idle */ 177 + #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) 178 + 176 179 /* 177 180 * The key defines the order in the tree, and so it also defines (optimal) 178 181 * block layout. ··· 826 823 u8 csum; 827 824 } __attribute__ ((__packed__)); 828 825 826 + struct btrfs_dev_stats_item { 827 + /* 828 + * grow this item struct at the end for future enhancements and keep 829 + * the existing values unchanged 830 + */ 831 + __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; 832 + } __attribute__ ((__packed__)); 833 + 829 834 /* different types of block groups (and chunks) */ 830 835 #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) 831 836 #define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) ··· 1140 1129 spinlock_t delayed_iput_lock; 1141 1130 struct list_head delayed_iputs; 1142 1131 1132 + /* this protects tree_mod_seq_list */ 1133 + spinlock_t tree_mod_seq_lock; 1134 + atomic_t tree_mod_seq; 1135 + struct list_head tree_mod_seq_list; 1136 + 1137 + /* this protects tree_mod_log */ 1138 + rwlock_t tree_mod_log_lock; 1139 + struct rb_root tree_mod_log; 1140 + 1143 1141 atomic_t nr_async_submits; 1144 1142 atomic_t async_submit_draining; 1145 1143 atomic_t nr_async_bios; ··· 1395 1375 struct list_head root_list; 1396 1376 1397 1377 spinlock_t orphan_lock; 1398 - struct list_head orphan_list; 1378 + atomic_t orphan_inodes; 1399 1379 struct btrfs_block_rsv *orphan_block_rsv; 1400 1380 int orphan_item_inserted; 1401 1381 int orphan_cleanup_state; ··· 1526 1506 #define BTRFS_CHUNK_ITEM_KEY 228 1527 1507 1528 1508 #define BTRFS_BALANCE_ITEM_KEY 248 1509 + 1510 + /* 1511 + * Persistantly stores the io stats in the device tree. 1512 + * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid). 1513 + */ 1514 + #define BTRFS_DEV_STATS_KEY 249 1529 1515 1530 1516 /* 1531 1517 * string items are for debugging. 
They just store a short string of ··· 2441 2415 return btrfs_item_size(eb, e) - offset; 2442 2416 } 2443 2417 2418 + /* btrfs_dev_stats_item */ 2419 + static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, 2420 + struct btrfs_dev_stats_item *ptr, 2421 + int index) 2422 + { 2423 + u64 val; 2424 + 2425 + read_extent_buffer(eb, &val, 2426 + offsetof(struct btrfs_dev_stats_item, values) + 2427 + ((unsigned long)ptr) + (index * sizeof(u64)), 2428 + sizeof(val)); 2429 + return val; 2430 + } 2431 + 2432 + static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, 2433 + struct btrfs_dev_stats_item *ptr, 2434 + int index, u64 val) 2435 + { 2436 + write_extent_buffer(eb, &val, 2437 + offsetof(struct btrfs_dev_stats_item, values) + 2438 + ((unsigned long)ptr) + (index * sizeof(u64)), 2439 + sizeof(val)); 2440 + } 2441 + 2444 2442 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2445 2443 { 2446 2444 return sb->s_fs_info; ··· 2546 2496 struct btrfs_root *root, u32 blocksize, 2547 2497 u64 parent, u64 root_objectid, 2548 2498 struct btrfs_disk_key *key, int level, 2549 - u64 hint, u64 empty_size, int for_cow); 2499 + u64 hint, u64 empty_size); 2550 2500 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2551 2501 struct btrfs_root *root, 2552 2502 struct extent_buffer *buf, 2553 - u64 parent, int last_ref, int for_cow); 2503 + u64 parent, int last_ref); 2554 2504 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2555 2505 struct btrfs_root *root, 2556 2506 u64 bytenr, u32 blocksize, ··· 2709 2659 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root 2710 2660 *root, struct btrfs_key *key, struct btrfs_path *p, int 2711 2661 ins_len, int cow); 2662 + int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, 2663 + struct btrfs_path *p, u64 time_seq); 2712 2664 int btrfs_realloc_node(struct btrfs_trans_handle *trans, 2713 2665 struct btrfs_root *root, struct 
extent_buffer *parent, 2714 2666 int start_slot, int cache_only, u64 *last_ret, ··· 3150 3098 int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, 3151 3099 u64 start, int err); 3152 3100 3101 + /* delayed seq elem */ 3102 + struct seq_list { 3103 + struct list_head list; 3104 + u64 seq; 3105 + u32 flags; 3106 + }; 3107 + 3108 + void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, 3109 + struct seq_list *elem); 3110 + void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, 3111 + struct seq_list *elem); 3112 + 3113 + static inline int is_fstree(u64 rootid) 3114 + { 3115 + if (rootid == BTRFS_FS_TREE_OBJECTID || 3116 + (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) 3117 + return 1; 3118 + return 0; 3119 + } 3153 3120 #endif
+4 -4
fs/btrfs/delayed-inode.c
··· 669 669 return ret; 670 670 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 671 671 spin_lock(&BTRFS_I(inode)->lock); 672 - if (BTRFS_I(inode)->delalloc_meta_reserved) { 673 - BTRFS_I(inode)->delalloc_meta_reserved = 0; 672 + if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 673 + &BTRFS_I(inode)->runtime_flags)) { 674 674 spin_unlock(&BTRFS_I(inode)->lock); 675 675 release = true; 676 676 goto migrate; ··· 1706 1706 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); 1707 1707 btrfs_set_stack_inode_generation(inode_item, 1708 1708 BTRFS_I(inode)->generation); 1709 - btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence); 1709 + btrfs_set_stack_inode_sequence(inode_item, inode->i_version); 1710 1710 btrfs_set_stack_inode_transid(inode_item, trans->transid); 1711 1711 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); 1712 1712 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); ··· 1754 1754 set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); 1755 1755 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); 1756 1756 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); 1757 - BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); 1757 + inode->i_version = btrfs_stack_inode_sequence(inode_item); 1758 1758 inode->i_rdev = 0; 1759 1759 *rdev = btrfs_stack_inode_rdev(inode_item); 1760 1760 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
+6 -4
fs/btrfs/delayed-ref.c
··· 525 525 ref->is_head = 0; 526 526 ref->in_tree = 1; 527 527 528 - if (need_ref_seq(for_cow, ref_root)) 528 + if (is_fstree(ref_root)) 529 529 seq = inc_delayed_seq(delayed_refs); 530 530 ref->seq = seq; 531 531 ··· 584 584 ref->is_head = 0; 585 585 ref->in_tree = 1; 586 586 587 - if (need_ref_seq(for_cow, ref_root)) 587 + if (is_fstree(ref_root)) 588 588 seq = inc_delayed_seq(delayed_refs); 589 589 ref->seq = seq; 590 590 ··· 658 658 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 659 659 num_bytes, parent, ref_root, level, action, 660 660 for_cow); 661 - if (!need_ref_seq(for_cow, ref_root) && 661 + if (!is_fstree(ref_root) && 662 662 waitqueue_active(&delayed_refs->seq_wait)) 663 663 wake_up(&delayed_refs->seq_wait); 664 664 spin_unlock(&delayed_refs->lock); 665 + 665 666 return 0; 666 667 } 667 668 ··· 707 706 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 708 707 num_bytes, parent, ref_root, owner, offset, 709 708 action, for_cow); 710 - if (!need_ref_seq(for_cow, ref_root) && 709 + if (!is_fstree(ref_root) && 711 710 waitqueue_active(&delayed_refs->seq_wait)) 712 711 wake_up(&delayed_refs->seq_wait); 713 712 spin_unlock(&delayed_refs->lock); 713 + 714 714 return 0; 715 715 } 716 716
-24
fs/btrfs/delayed-ref.h
··· 195 195 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 196 196 struct list_head *cluster, u64 search_start); 197 197 198 - struct seq_list { 199 - struct list_head list; 200 - u64 seq; 201 - }; 202 - 203 198 static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) 204 199 { 205 200 assert_spin_locked(&delayed_refs->lock); ··· 223 228 224 229 int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, 225 230 u64 seq); 226 - 227 - /* 228 - * delayed refs with a ref_seq > 0 must be held back during backref walking. 229 - * this only applies to items in one of the fs-trees. for_cow items never need 230 - * to be held back, so they won't get a ref_seq number. 231 - */ 232 - static inline int need_ref_seq(int for_cow, u64 rootid) 233 - { 234 - if (for_cow) 235 - return 0; 236 - 237 - if (rootid == BTRFS_FS_TREE_OBJECTID) 238 - return 1; 239 - 240 - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) 241 - return 1; 242 - 243 - return 0; 244 - } 245 231 246 232 /* 247 233 * a node might live in a head or a regular ref, this lets you
+25 -32
fs/btrfs/disk-io.c
··· 1153 1153 root->orphan_block_rsv = NULL; 1154 1154 1155 1155 INIT_LIST_HEAD(&root->dirty_list); 1156 - INIT_LIST_HEAD(&root->orphan_list); 1157 1156 INIT_LIST_HEAD(&root->root_list); 1158 1157 spin_lock_init(&root->orphan_lock); 1159 1158 spin_lock_init(&root->inode_lock); ··· 1165 1166 atomic_set(&root->log_commit[0], 0); 1166 1167 atomic_set(&root->log_commit[1], 0); 1167 1168 atomic_set(&root->log_writers, 0); 1169 + atomic_set(&root->orphan_inodes, 0); 1168 1170 root->log_batch = 0; 1169 1171 root->log_transid = 0; 1170 1172 root->last_log_commit = 0; ··· 1252 1252 1253 1253 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1254 1254 BTRFS_TREE_LOG_OBJECTID, NULL, 1255 - 0, 0, 0, 0); 1255 + 0, 0, 0); 1256 1256 if (IS_ERR(leaf)) { 1257 1257 kfree(root); 1258 1258 return ERR_CAST(leaf); ··· 1914 1914 spin_lock_init(&fs_info->delayed_iput_lock); 1915 1915 spin_lock_init(&fs_info->defrag_inodes_lock); 1916 1916 spin_lock_init(&fs_info->free_chunk_lock); 1917 + spin_lock_init(&fs_info->tree_mod_seq_lock); 1918 + rwlock_init(&fs_info->tree_mod_log_lock); 1917 1919 mutex_init(&fs_info->reloc_mutex); 1918 1920 1919 1921 init_completion(&fs_info->kobj_unregister); 1920 1922 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1921 1923 INIT_LIST_HEAD(&fs_info->space_info); 1924 + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); 1922 1925 btrfs_mapping_init(&fs_info->mapping_tree); 1923 1926 btrfs_init_block_rsv(&fs_info->global_block_rsv); 1924 1927 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); ··· 1934 1931 atomic_set(&fs_info->async_submit_draining, 0); 1935 1932 atomic_set(&fs_info->nr_async_bios, 0); 1936 1933 atomic_set(&fs_info->defrag_running, 0); 1934 + atomic_set(&fs_info->tree_mod_seq, 0); 1937 1935 fs_info->sb = sb; 1938 1936 fs_info->max_inline = 8192 * 1024; 1939 1937 fs_info->metadata_ratio = 0; 1940 1938 fs_info->defrag_inodes = RB_ROOT; 1941 1939 fs_info->trans_no_join = 0; 1942 1940 fs_info->free_chunk_space = 0; 1941 + fs_info->tree_mod_log 
= RB_ROOT; 1943 1942 1944 1943 /* readahead state */ 1945 1944 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); ··· 2006 2001 BTRFS_I(fs_info->btree_inode)->root = tree_root; 2007 2002 memset(&BTRFS_I(fs_info->btree_inode)->location, 0, 2008 2003 sizeof(struct btrfs_key)); 2009 - BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; 2004 + set_bit(BTRFS_INODE_DUMMY, 2005 + &BTRFS_I(fs_info->btree_inode)->runtime_flags); 2010 2006 insert_inode_hash(fs_info->btree_inode); 2011 2007 2012 2008 spin_lock_init(&fs_info->block_group_cache_lock); ··· 2359 2353 fs_info->generation = generation; 2360 2354 fs_info->last_trans_committed = generation; 2361 2355 2356 + ret = btrfs_init_dev_stats(fs_info); 2357 + if (ret) { 2358 + printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n", 2359 + ret); 2360 + goto fail_block_groups; 2361 + } 2362 + 2362 2363 ret = btrfs_init_space_info(fs_info); 2363 2364 if (ret) { 2364 2365 printk(KERN_ERR "Failed to initial space info: %d\n", ret); ··· 2569 2556 2570 2557 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2571 2558 { 2572 - char b[BDEVNAME_SIZE]; 2573 - 2574 2559 if (uptodate) { 2575 2560 set_buffer_uptodate(bh); 2576 2561 } else { 2562 + struct btrfs_device *device = (struct btrfs_device *) 2563 + bh->b_private; 2564 + 2577 2565 printk_ratelimited(KERN_WARNING "lost page write due to " 2578 - "I/O error on %s\n", 2579 - bdevname(bh->b_bdev, b)); 2566 + "I/O error on %s\n", device->name); 2580 2567 /* note, we dont' set_buffer_write_io_error because we have 2581 2568 * our own ways of dealing with the IO errors 2582 2569 */ 2583 2570 clear_buffer_uptodate(bh); 2571 + btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); 2584 2572 } 2585 2573 unlock_buffer(bh); 2586 2574 put_bh(bh); ··· 2696 2682 set_buffer_uptodate(bh); 2697 2683 lock_buffer(bh); 2698 2684 bh->b_end_io = btrfs_end_buffer_write_sync; 2685 + bh->b_private = device; 2699 2686 } 2700 2687 2701 2688 /* ··· 2755 2740 } 2756 
2741 if (!bio_flagged(bio, BIO_UPTODATE)) { 2757 2742 ret = -EIO; 2743 + if (!bio_flagged(bio, BIO_EOPNOTSUPP)) 2744 + btrfs_dev_stat_inc_and_print(device, 2745 + BTRFS_DEV_STAT_FLUSH_ERRS); 2758 2746 } 2759 2747 2760 2748 /* drop the reference from the wait == 0 run */ ··· 2918 2900 2919 2901 ret = write_all_supers(root, max_mirrors); 2920 2902 return ret; 2921 - } 2922 - 2923 - /* Kill all outstanding I/O */ 2924 - void btrfs_abort_devices(struct btrfs_root *root) 2925 - { 2926 - struct list_head *head; 2927 - struct btrfs_device *dev; 2928 - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2929 - head = &root->fs_info->fs_devices->devices; 2930 - list_for_each_entry_rcu(dev, head, dev_list) { 2931 - blk_abort_queue(dev->bdev->bd_disk->queue); 2932 - } 2933 - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2934 2903 } 2935 2904 2936 2905 void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) ··· 3676 3671 return 0; 3677 3672 } 3678 3673 3679 - static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page, 3680 - u64 start, u64 end, 3681 - struct extent_state *state) 3682 - { 3683 - struct super_block *sb = page->mapping->host->i_sb; 3684 - struct btrfs_fs_info *fs_info = btrfs_sb(sb); 3685 - btrfs_error(fs_info, -EIO, 3686 - "Error occured while writing out btree at %llu", start); 3687 - return -EIO; 3688 - } 3689 - 3690 3674 static struct extent_io_ops btree_extent_io_ops = { 3691 3675 .write_cache_pages_lock_hook = btree_lock_page_hook, 3692 3676 .readpage_end_io_hook = btree_readpage_end_io_hook, ··· 3683 3689 .submit_bio_hook = btree_submit_bio_hook, 3684 3690 /* note we're sharing with inode.c for the merge bio hook */ 3685 3691 .merge_bio_hook = btrfs_merge_bio_hook, 3686 - .writepage_io_failed_hook = btree_writepage_io_failed_hook, 3687 3692 };
-1
fs/btrfs/disk-io.h
··· 89 89 int btrfs_cleanup_transaction(struct btrfs_root *root); 90 90 void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, 91 91 struct btrfs_root *root); 92 - void btrfs_abort_devices(struct btrfs_root *root); 93 92 94 93 #ifdef CONFIG_DEBUG_LOCK_ALLOC 95 94 void btrfs_init_lockdep(void);
+12 -11
fs/btrfs/extent-tree.c
··· 3578 3578 space_info->chunk_alloc = 0; 3579 3579 spin_unlock(&space_info->lock); 3580 3580 out: 3581 - mutex_unlock(&extent_root->fs_info->chunk_mutex); 3581 + mutex_unlock(&fs_info->chunk_mutex); 3582 3582 return ret; 3583 3583 } 3584 3584 ··· 4355 4355 BTRFS_I(inode)->outstanding_extents--; 4356 4356 4357 4357 if (BTRFS_I(inode)->outstanding_extents == 0 && 4358 - BTRFS_I(inode)->delalloc_meta_reserved) { 4358 + test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4359 + &BTRFS_I(inode)->runtime_flags)) 4359 4360 drop_inode_space = 1; 4360 - BTRFS_I(inode)->delalloc_meta_reserved = 0; 4361 - } 4362 4361 4363 4362 /* 4364 4363 * If we have more or the same amount of outsanding extents than we have ··· 4464 4465 * Add an item to reserve for updating the inode when we complete the 4465 4466 * delalloc io. 4466 4467 */ 4467 - if (!BTRFS_I(inode)->delalloc_meta_reserved) { 4468 + if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4469 + &BTRFS_I(inode)->runtime_flags)) { 4468 4470 nr_extents++; 4469 4471 extra_reserve = 1; 4470 4472 } ··· 4511 4511 4512 4512 spin_lock(&BTRFS_I(inode)->lock); 4513 4513 if (extra_reserve) { 4514 - BTRFS_I(inode)->delalloc_meta_reserved = 1; 4514 + set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4515 + &BTRFS_I(inode)->runtime_flags); 4515 4516 nr_extents--; 4516 4517 } 4517 4518 BTRFS_I(inode)->reserved_extents += nr_extents; ··· 5218 5217 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5219 5218 struct btrfs_root *root, 5220 5219 struct extent_buffer *buf, 5221 - u64 parent, int last_ref, int for_cow) 5220 + u64 parent, int last_ref) 5222 5221 { 5223 5222 struct btrfs_block_group_cache *cache = NULL; 5224 5223 int ret; ··· 5228 5227 buf->start, buf->len, 5229 5228 parent, root->root_key.objectid, 5230 5229 btrfs_header_level(buf), 5231 - BTRFS_DROP_DELAYED_REF, NULL, for_cow); 5230 + BTRFS_DROP_DELAYED_REF, NULL, 0); 5232 5231 BUG_ON(ret); /* -ENOMEM */ 5233 5232 } 5234 5233 ··· 6250 6249 struct btrfs_root *root, u32 
blocksize, 6251 6250 u64 parent, u64 root_objectid, 6252 6251 struct btrfs_disk_key *key, int level, 6253 - u64 hint, u64 empty_size, int for_cow) 6252 + u64 hint, u64 empty_size) 6254 6253 { 6255 6254 struct btrfs_key ins; 6256 6255 struct btrfs_block_rsv *block_rsv; ··· 6298 6297 ins.objectid, 6299 6298 ins.offset, parent, root_objectid, 6300 6299 level, BTRFS_ADD_DELAYED_EXTENT, 6301 - extent_op, for_cow); 6300 + extent_op, 0); 6302 6301 BUG_ON(ret); /* -ENOMEM */ 6303 6302 } 6304 6303 return buf; ··· 6716 6715 btrfs_header_owner(path->nodes[level + 1])); 6717 6716 } 6718 6717 6719 - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); 6718 + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 6720 6719 out: 6721 6720 wc->refs[level] = 0; 6722 6721 wc->flags[level] = 0;
+115 -53
fs/btrfs/extent_io.c
··· 186 186 return parent; 187 187 } 188 188 189 - entry = rb_entry(node, struct tree_entry, rb_node); 190 189 rb_link_node(node, parent, p); 191 190 rb_insert_color(node, root); 192 191 return NULL; ··· 412 413 413 414 /* 414 415 * utility function to clear some bits in an extent state struct. 415 - * it will optionally wake up any one waiting on this state (wake == 1) 416 + * it will optionally wake up any one waiting on this state (wake == 1). 416 417 * 417 418 * If no bits are set on the state struct after clearing things, the 418 419 * struct is freed and removed from the tree ··· 569 570 if (err) 570 571 goto out; 571 572 if (state->end <= end) { 572 - clear_state_bit(tree, state, &bits, wake); 573 - if (last_end == (u64)-1) 574 - goto out; 575 - start = last_end + 1; 573 + state = clear_state_bit(tree, state, &bits, wake); 574 + goto next; 576 575 } 577 576 goto search_again; 578 577 } ··· 778 781 * Just lock what we found and keep going 779 782 */ 780 783 if (state->start == start && state->end <= end) { 781 - struct rb_node *next_node; 782 784 if (state->state & exclusive_bits) { 783 785 *failed_start = state->start; 784 786 err = -EEXIST; ··· 785 789 } 786 790 787 791 set_state_bits(tree, state, &bits); 788 - 789 792 cache_state(state, cached_state); 790 793 merge_state(tree, state); 791 794 if (last_end == (u64)-1) 792 795 goto out; 793 - 794 796 start = last_end + 1; 795 - next_node = rb_next(&state->rb_node); 796 - if (next_node && start < end && prealloc && !need_resched()) { 797 - state = rb_entry(next_node, struct extent_state, 798 - rb_node); 799 - if (state->start == start) 800 - goto hit_next; 801 - } 797 + state = next_state(state); 798 + if (start < end && state && state->start == start && 799 + !need_resched()) 800 + goto hit_next; 802 801 goto search_again; 803 802 } 804 803 ··· 836 845 if (last_end == (u64)-1) 837 846 goto out; 838 847 start = last_end + 1; 848 + state = next_state(state); 849 + if (start < end && state && state->start == 
start && 850 + !need_resched()) 851 + goto hit_next; 839 852 } 840 853 goto search_again; 841 854 } ··· 989 994 * Just lock what we found and keep going 990 995 */ 991 996 if (state->start == start && state->end <= end) { 992 - struct rb_node *next_node; 993 - 994 997 set_state_bits(tree, state, &bits); 995 - clear_state_bit(tree, state, &clear_bits, 0); 998 + state = clear_state_bit(tree, state, &clear_bits, 0); 996 999 if (last_end == (u64)-1) 997 1000 goto out; 998 - 999 1001 start = last_end + 1; 1000 - next_node = rb_next(&state->rb_node); 1001 - if (next_node && start < end && prealloc && !need_resched()) { 1002 - state = rb_entry(next_node, struct extent_state, 1003 - rb_node); 1004 - if (state->start == start) 1005 - goto hit_next; 1006 - } 1002 + if (start < end && state && state->start == start && 1003 + !need_resched()) 1004 + goto hit_next; 1007 1005 goto search_again; 1008 1006 } 1009 1007 ··· 1030 1042 goto out; 1031 1043 if (state->end <= end) { 1032 1044 set_state_bits(tree, state, &bits); 1033 - clear_state_bit(tree, state, &clear_bits, 0); 1045 + state = clear_state_bit(tree, state, &clear_bits, 0); 1034 1046 if (last_end == (u64)-1) 1035 1047 goto out; 1036 1048 start = last_end + 1; 1049 + if (start < end && state && state->start == start && 1050 + !need_resched()) 1051 + goto hit_next; 1037 1052 } 1038 1053 goto search_again; 1039 1054 } ··· 1164 1173 cached_state, mask); 1165 1174 } 1166 1175 1167 - static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 1168 - u64 end, struct extent_state **cached_state, 1169 - gfp_t mask) 1176 + int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 1177 + struct extent_state **cached_state, gfp_t mask) 1170 1178 { 1171 1179 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 1172 1180 cached_state, mask); ··· 1283 1293 * returned if we find something, and *start_ret and *end_ret are 1284 1294 * set to reflect the state struct that was found. 
1285 1295 * 1286 - * If nothing was found, 1 is returned, < 0 on error 1296 + * If nothing was found, 1 is returned. If found something, return 0. 1287 1297 */ 1288 1298 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1289 1299 u64 *start_ret, u64 *end_ret, int bits) ··· 1913 1923 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1914 1924 /* try to remap that extent elsewhere? */ 1915 1925 bio_put(bio); 1926 + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 1916 1927 return -EIO; 1917 1928 } 1918 1929 ··· 2213 2222 uptodate = 0; 2214 2223 } 2215 2224 2216 - if (!uptodate && tree->ops && 2217 - tree->ops->writepage_io_failed_hook) { 2218 - ret = tree->ops->writepage_io_failed_hook(NULL, page, 2219 - start, end, NULL); 2220 - /* Writeback already completed */ 2221 - if (ret == 0) 2222 - return 1; 2223 - } 2224 - 2225 2225 if (!uptodate) { 2226 - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); 2227 2226 ClearPageUptodate(page); 2228 2227 SetPageError(page); 2229 2228 } ··· 2328 2347 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 2329 2348 ret = tree->ops->readpage_end_io_hook(page, start, end, 2330 2349 state, mirror); 2331 - if (ret) 2350 + if (ret) { 2351 + /* no IO indicated but software detected errors 2352 + * in the block, either checksum errors or 2353 + * issues with the contents */ 2354 + struct btrfs_root *root = 2355 + BTRFS_I(page->mapping->host)->root; 2356 + struct btrfs_device *device; 2357 + 2332 2358 uptodate = 0; 2333 - else 2359 + device = btrfs_find_device_for_logical( 2360 + root, start, mirror); 2361 + if (device) 2362 + btrfs_dev_stat_inc_and_print(device, 2363 + BTRFS_DEV_STAT_CORRUPTION_ERRS); 2364 + } else { 2334 2365 clean_io_failure(start, page); 2366 + } 2335 2367 } 2336 2368 2337 2369 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { ··· 3158 3164 u64 offset = eb->start; 3159 3165 unsigned long i, num_pages; 3160 3166 int rw = (epd->sync_io ? 
WRITE_SYNC : WRITE); 3161 - int ret; 3167 + int ret = 0; 3162 3168 3163 3169 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3164 3170 num_pages = num_extent_pages(eb->start, eb->len); ··· 3924 3930 eb->start = start; 3925 3931 eb->len = len; 3926 3932 eb->tree = tree; 3933 + eb->bflags = 0; 3927 3934 rwlock_init(&eb->lock); 3928 3935 atomic_set(&eb->write_locks, 0); 3929 3936 atomic_set(&eb->read_locks, 0); ··· 3962 3967 return eb; 3963 3968 } 3964 3969 3970 + struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 3971 + { 3972 + unsigned long i; 3973 + struct page *p; 3974 + struct extent_buffer *new; 3975 + unsigned long num_pages = num_extent_pages(src->start, src->len); 3976 + 3977 + new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); 3978 + if (new == NULL) 3979 + return NULL; 3980 + 3981 + for (i = 0; i < num_pages; i++) { 3982 + p = alloc_page(GFP_ATOMIC); 3983 + BUG_ON(!p); 3984 + attach_extent_buffer_page(new, p); 3985 + WARN_ON(PageDirty(p)); 3986 + SetPageUptodate(p); 3987 + new->pages[i] = p; 3988 + } 3989 + 3990 + copy_extent_buffer(new, src, 0, 0, src->len); 3991 + set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 3992 + set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); 3993 + 3994 + return new; 3995 + } 3996 + 3997 + struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) 3998 + { 3999 + struct extent_buffer *eb; 4000 + unsigned long num_pages = num_extent_pages(0, len); 4001 + unsigned long i; 4002 + 4003 + eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); 4004 + if (!eb) 4005 + return NULL; 4006 + 4007 + for (i = 0; i < num_pages; i++) { 4008 + eb->pages[i] = alloc_page(GFP_ATOMIC); 4009 + if (!eb->pages[i]) 4010 + goto err; 4011 + } 4012 + set_extent_buffer_uptodate(eb); 4013 + btrfs_set_header_nritems(eb, 0); 4014 + set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4015 + 4016 + return eb; 4017 + err: 4018 + for (i--; i > 0; i--) 4019 + __free_page(eb->pages[i]); 4020 + __free_extent_buffer(eb); 
4021 + return NULL; 4022 + } 4023 + 3965 4024 static int extent_buffer_under_io(struct extent_buffer *eb) 3966 4025 { 3967 4026 return (atomic_read(&eb->io_pages) || ··· 4030 3981 unsigned long start_idx) 4031 3982 { 4032 3983 unsigned long index; 3984 + unsigned long num_pages; 4033 3985 struct page *page; 3986 + int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4034 3987 4035 3988 BUG_ON(extent_buffer_under_io(eb)); 4036 3989 4037 - index = num_extent_pages(eb->start, eb->len); 3990 + num_pages = num_extent_pages(eb->start, eb->len); 3991 + index = start_idx + num_pages; 4038 3992 if (start_idx >= index) 4039 3993 return; 4040 3994 4041 3995 do { 4042 3996 index--; 4043 3997 page = extent_buffer_page(eb, index); 4044 - if (page) { 3998 + if (page && mapped) { 4045 3999 spin_lock(&page->mapping->private_lock); 4046 4000 /* 4047 4001 * We do this since we'll remove the pages after we've ··· 4069 4017 } 4070 4018 spin_unlock(&page->mapping->private_lock); 4071 4019 4020 + } 4021 + if (page) { 4072 4022 /* One for when we alloced the page */ 4073 4023 page_cache_release(page); 4074 4024 } ··· 4289 4235 { 4290 4236 WARN_ON(atomic_read(&eb->refs) == 0); 4291 4237 if (atomic_dec_and_test(&eb->refs)) { 4292 - struct extent_io_tree *tree = eb->tree; 4238 + if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { 4239 + spin_unlock(&eb->refs_lock); 4240 + } else { 4241 + struct extent_io_tree *tree = eb->tree; 4293 4242 4294 - spin_unlock(&eb->refs_lock); 4243 + spin_unlock(&eb->refs_lock); 4295 4244 4296 - spin_lock(&tree->buffer_lock); 4297 - radix_tree_delete(&tree->buffer, 4298 - eb->start >> PAGE_CACHE_SHIFT); 4299 - spin_unlock(&tree->buffer_lock); 4245 + spin_lock(&tree->buffer_lock); 4246 + radix_tree_delete(&tree->buffer, 4247 + eb->start >> PAGE_CACHE_SHIFT); 4248 + spin_unlock(&tree->buffer_lock); 4249 + } 4300 4250 4301 4251 /* Should be safe to release our pages at this point */ 4302 4252 btrfs_release_extent_buffer_page(eb, 0); ··· 4317 4259 return; 4318 
4260 4319 4261 spin_lock(&eb->refs_lock); 4262 + if (atomic_read(&eb->refs) == 2 && 4263 + test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 4264 + atomic_dec(&eb->refs); 4265 + 4320 4266 if (atomic_read(&eb->refs) == 2 && 4321 4267 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 4322 4268 !extent_buffer_under_io(eb) &&
+5 -3
fs/btrfs/extent_io.h
··· 39 39 #define EXTENT_BUFFER_STALE 6 40 40 #define EXTENT_BUFFER_WRITEBACK 7 41 41 #define EXTENT_BUFFER_IOERR 8 42 + #define EXTENT_BUFFER_DUMMY 9 42 43 43 44 /* these are flags for extent_clear_unlock_delalloc */ 44 45 #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 ··· 76 75 unsigned long bio_flags); 77 76 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 78 77 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); 79 - int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 80 - u64 start, u64 end, 81 - struct extent_state *state); 82 78 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, 83 79 struct extent_state *state, int mirror); 84 80 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, ··· 223 225 struct extent_state **cached_state, gfp_t mask); 224 226 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 225 227 struct extent_state **cached_state, gfp_t mask); 228 + int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 229 + struct extent_state **cached_state, gfp_t mask); 226 230 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 227 231 gfp_t mask); 228 232 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, ··· 265 265 266 266 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 267 267 u64 start, unsigned long len); 268 + struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); 269 + struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); 268 270 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 269 271 u64 start, unsigned long len); 270 272 void free_extent_buffer(struct extent_buffer *eb);
+53 -23
fs/btrfs/file.c
··· 65 65 int cycled; 66 66 }; 67 67 68 + static int __compare_inode_defrag(struct inode_defrag *defrag1, 69 + struct inode_defrag *defrag2) 70 + { 71 + if (defrag1->root > defrag2->root) 72 + return 1; 73 + else if (defrag1->root < defrag2->root) 74 + return -1; 75 + else if (defrag1->ino > defrag2->ino) 76 + return 1; 77 + else if (defrag1->ino < defrag2->ino) 78 + return -1; 79 + else 80 + return 0; 81 + } 82 + 68 83 /* pop a record for an inode into the defrag tree. The lock 69 84 * must be held already 70 85 * ··· 96 81 struct inode_defrag *entry; 97 82 struct rb_node **p; 98 83 struct rb_node *parent = NULL; 84 + int ret; 99 85 100 86 p = &root->fs_info->defrag_inodes.rb_node; 101 87 while (*p) { 102 88 parent = *p; 103 89 entry = rb_entry(parent, struct inode_defrag, rb_node); 104 90 105 - if (defrag->ino < entry->ino) 91 + ret = __compare_inode_defrag(defrag, entry); 92 + if (ret < 0) 106 93 p = &parent->rb_left; 107 - else if (defrag->ino > entry->ino) 94 + else if (ret > 0) 108 95 p = &parent->rb_right; 109 96 else { 110 97 /* if we're reinserting an entry for ··· 120 103 goto exists; 121 104 } 122 105 } 123 - BTRFS_I(inode)->in_defrag = 1; 106 + set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 124 107 rb_link_node(&defrag->rb_node, parent, p); 125 108 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 126 109 return; ··· 148 131 if (btrfs_fs_closing(root->fs_info)) 149 132 return 0; 150 133 151 - if (BTRFS_I(inode)->in_defrag) 134 + if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 152 135 return 0; 153 136 154 137 if (trans) ··· 165 148 defrag->root = root->root_key.objectid; 166 149 167 150 spin_lock(&root->fs_info->defrag_inodes_lock); 168 - if (!BTRFS_I(inode)->in_defrag) 151 + if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) 169 152 __btrfs_add_inode_defrag(inode, defrag); 170 153 else 171 154 kfree(defrag); ··· 176 159 /* 177 160 * must be called with the defrag_inodes lock held 
178 161 */ 179 - struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, 162 + struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, 163 + u64 root, u64 ino, 180 164 struct rb_node **next) 181 165 { 182 166 struct inode_defrag *entry = NULL; 167 + struct inode_defrag tmp; 183 168 struct rb_node *p; 184 169 struct rb_node *parent = NULL; 170 + int ret; 171 + 172 + tmp.ino = ino; 173 + tmp.root = root; 185 174 186 175 p = info->defrag_inodes.rb_node; 187 176 while (p) { 188 177 parent = p; 189 178 entry = rb_entry(parent, struct inode_defrag, rb_node); 190 179 191 - if (ino < entry->ino) 180 + ret = __compare_inode_defrag(&tmp, entry); 181 + if (ret < 0) 192 182 p = parent->rb_left; 193 - else if (ino > entry->ino) 183 + else if (ret > 0) 194 184 p = parent->rb_right; 195 185 else 196 186 return entry; 197 187 } 198 188 199 189 if (next) { 200 - while (parent && ino > entry->ino) { 190 + while (parent && __compare_inode_defrag(&tmp, entry) > 0) { 201 191 parent = rb_next(parent); 202 192 entry = rb_entry(parent, struct inode_defrag, rb_node); 203 193 } ··· 226 202 struct btrfs_key key; 227 203 struct btrfs_ioctl_defrag_range_args range; 228 204 u64 first_ino = 0; 205 + u64 root_objectid = 0; 229 206 int num_defrag; 230 207 int defrag_batch = 1024; 231 208 ··· 239 214 n = NULL; 240 215 241 216 /* find an inode to defrag */ 242 - defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); 217 + defrag = btrfs_find_defrag_inode(fs_info, root_objectid, 218 + first_ino, &n); 243 219 if (!defrag) { 244 - if (n) 245 - defrag = rb_entry(n, struct inode_defrag, rb_node); 246 - else if (first_ino) { 220 + if (n) { 221 + defrag = rb_entry(n, struct inode_defrag, 222 + rb_node); 223 + } else if (root_objectid || first_ino) { 224 + root_objectid = 0; 247 225 first_ino = 0; 248 226 continue; 249 227 } else { ··· 256 228 257 229 /* remove it from the rbtree */ 258 230 first_ino = defrag->ino + 1; 231 + root_objectid = defrag->root; 259 232 
rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); 260 233 261 234 if (btrfs_fs_closing(fs_info)) ··· 281 252 goto next; 282 253 283 254 /* do a chunk of defrag */ 284 - BTRFS_I(inode)->in_defrag = 0; 255 + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 285 256 range.start = defrag->last_offset; 286 257 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 287 258 defrag_batch); ··· 1438 1409 mutex_unlock(&inode->i_mutex); 1439 1410 goto out; 1440 1411 } 1441 - BTRFS_I(inode)->sequence++; 1442 1412 1443 1413 start_pos = round_down(pos, root->sectorsize); 1444 1414 if (start_pos > i_size_read(inode)) { ··· 1494 1466 * flush down new bytes that may have been written if the 1495 1467 * application were using truncate to replace a file in place. 1496 1468 */ 1497 - if (BTRFS_I(inode)->ordered_data_close) { 1498 - BTRFS_I(inode)->ordered_data_close = 0; 1469 + if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 1470 + &BTRFS_I(inode)->runtime_flags)) { 1499 1471 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); 1500 1472 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 1501 1473 filemap_flush(inode->i_mapping); ··· 1526 1498 1527 1499 trace_btrfs_sync_file(file, datasync); 1528 1500 1529 - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 1530 - if (ret) 1531 - return ret; 1532 1501 mutex_lock(&inode->i_mutex); 1533 1502 1534 - /* we wait first, since the writeback may change the inode */ 1503 + /* 1504 + * we wait first, since the writeback may change the inode, also wait 1505 + * ordered range does a filemape_write_and_wait_range which is why we 1506 + * don't do it above like other file systems. 
1507 + */ 1535 1508 root->log_batch++; 1536 - btrfs_wait_ordered_range(inode, 0, (u64)-1); 1509 + btrfs_wait_ordered_range(inode, start, end); 1537 1510 root->log_batch++; 1538 1511 1539 1512 /* ··· 1552 1523 * syncing 1553 1524 */ 1554 1525 smp_mb(); 1555 - if (BTRFS_I(inode)->last_trans <= 1526 + if (btrfs_inode_in_log(inode, root->fs_info->generation) || 1527 + BTRFS_I(inode)->last_trans <= 1556 1528 root->fs_info->last_trans_committed) { 1557 1529 BTRFS_I(inode)->last_trans = 0; 1558 1530 mutex_unlock(&inode->i_mutex);
+42 -3
fs/btrfs/free-space-cache.c
··· 33 33 34 34 static int link_free_space(struct btrfs_free_space_ctl *ctl, 35 35 struct btrfs_free_space *info); 36 + static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 37 + struct btrfs_free_space *info); 36 38 37 39 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 38 40 struct btrfs_path *path, ··· 586 584 return 0; 587 585 } 588 586 587 + /* 588 + * Since we attach pinned extents after the fact we can have contiguous sections 589 + * of free space that are split up in entries. This poses a problem with the 590 + * tree logging stuff since it could have allocated across what appears to be 2 591 + * entries since we would have merged the entries when adding the pinned extents 592 + * back to the free space cache. So run through the space cache that we just 593 + * loaded and merge contiguous entries. This will make the log replay stuff not 594 + * blow up and it will make for nicer allocator behavior. 595 + */ 596 + static void merge_space_tree(struct btrfs_free_space_ctl *ctl) 597 + { 598 + struct btrfs_free_space *e, *prev = NULL; 599 + struct rb_node *n; 600 + 601 + again: 602 + spin_lock(&ctl->tree_lock); 603 + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 604 + e = rb_entry(n, struct btrfs_free_space, offset_index); 605 + if (!prev) 606 + goto next; 607 + if (e->bitmap || prev->bitmap) 608 + goto next; 609 + if (prev->offset + prev->bytes == e->offset) { 610 + unlink_free_space(ctl, prev); 611 + unlink_free_space(ctl, e); 612 + prev->bytes += e->bytes; 613 + kmem_cache_free(btrfs_free_space_cachep, e); 614 + link_free_space(ctl, prev); 615 + prev = NULL; 616 + spin_unlock(&ctl->tree_lock); 617 + goto again; 618 + } 619 + next: 620 + prev = e; 621 + } 622 + spin_unlock(&ctl->tree_lock); 623 + } 624 + 589 625 int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 590 626 struct btrfs_free_space_ctl *ctl, 591 627 struct btrfs_path *path, u64 offset) ··· 766 726 } 767 727 768 728 
io_ctl_drop_pages(&io_ctl); 729 + merge_space_tree(ctl); 769 730 ret = 1; 770 731 out: 771 732 io_ctl_free(&io_ctl); ··· 1013 972 goto out; 1014 973 1015 974 1016 - ret = filemap_write_and_wait(inode->i_mapping); 1017 - if (ret) 1018 - goto out; 975 + btrfs_wait_ordered_range(inode, 0, (u64)-1); 1019 976 1020 977 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1021 978 key.offset = offset;
+126 -138
fs/btrfs/inode.c
··· 89 89 90 90 static int btrfs_setsize(struct inode *inode, loff_t newsize); 91 91 static int btrfs_truncate(struct inode *inode); 92 - static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 92 + static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); 93 93 static noinline int cow_file_range(struct inode *inode, 94 94 struct page *locked_page, 95 95 u64 start, u64 end, int *page_started, ··· 257 257 ret = insert_inline_extent(trans, root, inode, start, 258 258 inline_len, compressed_size, 259 259 compress_type, compressed_pages); 260 - if (ret) { 260 + if (ret && ret != -ENOSPC) { 261 261 btrfs_abort_transaction(trans, root, ret); 262 262 return ret; 263 + } else if (ret == -ENOSPC) { 264 + return 1; 263 265 } 266 + 264 267 btrfs_delalloc_release_metadata(inode, end + 1 - start); 265 268 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 266 269 return 0; ··· 1575 1572 if (btrfs_is_free_space_inode(root, inode)) 1576 1573 metadata = 2; 1577 1574 1578 - ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1579 - if (ret) 1580 - return ret; 1581 - 1582 1575 if (!(rw & REQ_WRITE)) { 1576 + ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1577 + if (ret) 1578 + return ret; 1579 + 1583 1580 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1584 1581 return btrfs_submit_compressed_read(inode, bio, 1585 1582 mirror_num, bio_flags); ··· 1818 1815 * an ordered extent if the range of bytes in the file it covers are 1819 1816 * fully written. 
1820 1817 */ 1821 - static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1818 + static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) 1822 1819 { 1820 + struct inode *inode = ordered_extent->inode; 1823 1821 struct btrfs_root *root = BTRFS_I(inode)->root; 1824 1822 struct btrfs_trans_handle *trans = NULL; 1825 - struct btrfs_ordered_extent *ordered_extent = NULL; 1826 1823 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1827 1824 struct extent_state *cached_state = NULL; 1828 1825 int compress_type = 0; 1829 1826 int ret; 1830 1827 bool nolock; 1831 1828 1832 - ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 1833 - end - start + 1); 1834 - if (!ret) 1835 - return 0; 1836 - BUG_ON(!ordered_extent); /* Logic error */ 1837 - 1838 1829 nolock = btrfs_is_free_space_inode(root, inode); 1830 + 1831 + if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { 1832 + ret = -EIO; 1833 + goto out; 1834 + } 1839 1835 1840 1836 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1841 1837 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ ··· 1891 1889 ordered_extent->file_offset, 1892 1890 ordered_extent->len); 1893 1891 } 1894 - unlock_extent_cached(io_tree, ordered_extent->file_offset, 1895 - ordered_extent->file_offset + 1896 - ordered_extent->len - 1, &cached_state, GFP_NOFS); 1892 + 1897 1893 if (ret < 0) { 1898 1894 btrfs_abort_transaction(trans, root, ret); 1899 - goto out; 1895 + goto out_unlock; 1900 1896 } 1901 1897 1902 1898 add_pending_csums(trans, inode, ordered_extent->file_offset, ··· 1905 1905 ret = btrfs_update_inode_fallback(trans, root, inode); 1906 1906 if (ret) { /* -ENOMEM or corruption */ 1907 1907 btrfs_abort_transaction(trans, root, ret); 1908 - goto out; 1908 + goto out_unlock; 1909 1909 } 1910 1910 } 1911 1911 ret = 0; 1912 + out_unlock: 1913 + unlock_extent_cached(io_tree, ordered_extent->file_offset, 1914 + ordered_extent->file_offset + 1915 + 
ordered_extent->len - 1, &cached_state, GFP_NOFS); 1912 1916 out: 1913 1917 if (root != root->fs_info->tree_root) 1914 1918 btrfs_delalloc_release_metadata(inode, ordered_extent->len); ··· 1923 1919 btrfs_end_transaction(trans, root); 1924 1920 } 1925 1921 1922 + if (ret) 1923 + clear_extent_uptodate(io_tree, ordered_extent->file_offset, 1924 + ordered_extent->file_offset + 1925 + ordered_extent->len - 1, NULL, GFP_NOFS); 1926 + 1927 + /* 1928 + * This needs to be dont to make sure anybody waiting knows we are done 1929 + * upating everything for this ordered extent. 1930 + */ 1931 + btrfs_remove_ordered_extent(inode, ordered_extent); 1932 + 1926 1933 /* once for us */ 1927 1934 btrfs_put_ordered_extent(ordered_extent); 1928 1935 /* once for the tree */ 1929 1936 btrfs_put_ordered_extent(ordered_extent); 1930 1937 1931 - return 0; 1932 - out_unlock: 1933 - unlock_extent_cached(io_tree, ordered_extent->file_offset, 1934 - ordered_extent->file_offset + 1935 - ordered_extent->len - 1, &cached_state, GFP_NOFS); 1936 - goto out; 1938 + return ret; 1939 + } 1940 + 1941 + static void finish_ordered_fn(struct btrfs_work *work) 1942 + { 1943 + struct btrfs_ordered_extent *ordered_extent; 1944 + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); 1945 + btrfs_finish_ordered_io(ordered_extent); 1937 1946 } 1938 1947 1939 1948 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1940 1949 struct extent_state *state, int uptodate) 1941 1950 { 1951 + struct inode *inode = page->mapping->host; 1952 + struct btrfs_root *root = BTRFS_I(inode)->root; 1953 + struct btrfs_ordered_extent *ordered_extent = NULL; 1954 + struct btrfs_workers *workers; 1955 + 1942 1956 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 1943 1957 1944 1958 ClearPagePrivate2(page); 1945 - return btrfs_finish_ordered_io(page->mapping->host, start, end); 1959 + if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 1960 + end - start + 1, 
uptodate)) 1961 + return 0; 1962 + 1963 + ordered_extent->work.func = finish_ordered_fn; 1964 + ordered_extent->work.flags = 0; 1965 + 1966 + if (btrfs_is_free_space_inode(root, inode)) 1967 + workers = &root->fs_info->endio_freespace_worker; 1968 + else 1969 + workers = &root->fs_info->endio_write_workers; 1970 + btrfs_queue_worker(workers, &ordered_extent->work); 1971 + 1972 + return 0; 1946 1973 } 1947 1974 1948 1975 /* ··· 2107 2072 struct btrfs_block_rsv *block_rsv; 2108 2073 int ret; 2109 2074 2110 - if (!list_empty(&root->orphan_list) || 2075 + if (atomic_read(&root->orphan_inodes) || 2111 2076 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 2112 2077 return; 2113 2078 2114 2079 spin_lock(&root->orphan_lock); 2115 - if (!list_empty(&root->orphan_list)) { 2080 + if (atomic_read(&root->orphan_inodes)) { 2116 2081 spin_unlock(&root->orphan_lock); 2117 2082 return; 2118 2083 } ··· 2169 2134 block_rsv = NULL; 2170 2135 } 2171 2136 2172 - if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2173 - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2137 + if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2138 + &BTRFS_I(inode)->runtime_flags)) { 2174 2139 #if 0 2175 2140 /* 2176 2141 * For proper ENOSPC handling, we should do orphan ··· 2183 2148 insert = 1; 2184 2149 #endif 2185 2150 insert = 1; 2151 + atomic_dec(&root->orphan_inodes); 2186 2152 } 2187 2153 2188 - if (!BTRFS_I(inode)->orphan_meta_reserved) { 2189 - BTRFS_I(inode)->orphan_meta_reserved = 1; 2154 + if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2155 + &BTRFS_I(inode)->runtime_flags)) 2190 2156 reserve = 1; 2191 - } 2192 2157 spin_unlock(&root->orphan_lock); 2193 2158 2194 2159 /* grab metadata reservation from transaction handle */ ··· 2201 2166 if (insert >= 1) { 2202 2167 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); 2203 2168 if (ret && ret != -EEXIST) { 2169 + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2170 + &BTRFS_I(inode)->runtime_flags); 2204 2171 
btrfs_abort_transaction(trans, root, ret); 2205 2172 return ret; 2206 2173 } ··· 2233 2196 int ret = 0; 2234 2197 2235 2198 spin_lock(&root->orphan_lock); 2236 - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2237 - list_del_init(&BTRFS_I(inode)->i_orphan); 2199 + if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2200 + &BTRFS_I(inode)->runtime_flags)) 2238 2201 delete_item = 1; 2239 - } 2240 2202 2241 - if (BTRFS_I(inode)->orphan_meta_reserved) { 2242 - BTRFS_I(inode)->orphan_meta_reserved = 0; 2203 + if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, 2204 + &BTRFS_I(inode)->runtime_flags)) 2243 2205 release_rsv = 1; 2244 - } 2245 2206 spin_unlock(&root->orphan_lock); 2246 2207 2247 2208 if (trans && delete_item) { ··· 2247 2212 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ 2248 2213 } 2249 2214 2250 - if (release_rsv) 2215 + if (release_rsv) { 2251 2216 btrfs_orphan_release_metadata(inode); 2217 + atomic_dec(&root->orphan_inodes); 2218 + } 2252 2219 2253 2220 return 0; 2254 2221 } ··· 2378 2341 ret = PTR_ERR(trans); 2379 2342 goto out; 2380 2343 } 2344 + printk(KERN_ERR "auto deleting %Lu\n", 2345 + found_key.objectid); 2381 2346 ret = btrfs_del_orphan_item(trans, root, 2382 2347 found_key.objectid); 2383 2348 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ ··· 2391 2352 * add this inode to the orphan list so btrfs_orphan_del does 2392 2353 * the proper thing when we hit it 2393 2354 */ 2394 - spin_lock(&root->orphan_lock); 2395 - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2396 - spin_unlock(&root->orphan_lock); 2355 + set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 2356 + &BTRFS_I(inode)->runtime_flags); 2397 2357 2398 2358 /* if we have links, this was a truncate, lets do that */ 2399 2359 if (inode->i_nlink) { ··· 2548 2510 2549 2511 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 2550 2512 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 2551 - BTRFS_I(inode)->sequence = 
btrfs_inode_sequence(leaf, inode_item); 2513 + inode->i_version = btrfs_inode_sequence(leaf, inode_item); 2552 2514 inode->i_generation = BTRFS_I(inode)->generation; 2553 2515 inode->i_rdev = 0; 2554 2516 rdev = btrfs_inode_rdev(leaf, inode_item); ··· 2632 2594 2633 2595 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); 2634 2596 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); 2635 - btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); 2597 + btrfs_set_inode_sequence(leaf, item, inode->i_version); 2636 2598 btrfs_set_inode_transid(leaf, item, trans->transid); 2637 2599 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2638 2600 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); ··· 2790 2752 goto out; 2791 2753 2792 2754 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2755 + inode_inc_iversion(inode); 2756 + inode_inc_iversion(dir); 2793 2757 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2794 2758 btrfs_update_inode(trans, root, dir); 2795 2759 out: ··· 3129 3089 } 3130 3090 3131 3091 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3092 + inode_inc_iversion(dir); 3132 3093 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3133 3094 ret = btrfs_update_inode(trans, root, dir); 3134 3095 if (ret) ··· 3648 3607 * any new writes get down to disk quickly. 
3649 3608 */ 3650 3609 if (newsize == 0) 3651 - BTRFS_I(inode)->ordered_data_close = 1; 3610 + set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 3611 + &BTRFS_I(inode)->runtime_flags); 3652 3612 3653 3613 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3654 3614 truncate_setsize(inode, newsize); ··· 3680 3638 3681 3639 if (attr->ia_valid) { 3682 3640 setattr_copy(inode, attr); 3641 + inode_inc_iversion(inode); 3683 3642 err = btrfs_dirty_inode(inode); 3684 3643 3685 3644 if (!err && attr->ia_valid & ATTR_MODE) ··· 3714 3671 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3715 3672 3716 3673 if (root->fs_info->log_root_recovering) { 3717 - BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); 3674 + BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 3675 + &BTRFS_I(inode)->runtime_flags)); 3718 3676 goto no_delete; 3719 3677 } 3720 3678 ··· 4110 4066 4111 4067 BTRFS_I(inode)->root = root; 4112 4068 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4113 - BTRFS_I(inode)->dummy_inode = 1; 4069 + set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 4114 4070 4115 4071 inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; 4116 4072 inode->i_op = &btrfs_dir_ro_inode_operations; ··· 4414 4370 int ret = 0; 4415 4371 bool nolock = false; 4416 4372 4417 - if (BTRFS_I(inode)->dummy_inode) 4373 + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4418 4374 return 0; 4419 4375 4420 4376 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) ··· 4447 4403 struct btrfs_trans_handle *trans; 4448 4404 int ret; 4449 4405 4450 - if (BTRFS_I(inode)->dummy_inode) 4406 + if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) 4451 4407 return 0; 4452 4408 4453 4409 trans = btrfs_join_transaction(root); ··· 4774 4730 4775 4731 btrfs_i_size_write(parent_inode, parent_inode->i_size + 4776 4732 name_len * 2); 4733 + inode_inc_iversion(parent_inode); 4777 4734 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; 4778 4735 ret = 
btrfs_update_inode(trans, root, parent_inode); 4779 4736 if (ret) ··· 4982 4937 } 4983 4938 4984 4939 btrfs_inc_nlink(inode); 4940 + inode_inc_iversion(inode); 4985 4941 inode->i_ctime = CURRENT_TIME; 4986 4942 ihold(inode); 4987 4943 ··· 5949 5903 struct btrfs_dio_private *dip = bio->bi_private; 5950 5904 struct inode *inode = dip->inode; 5951 5905 struct btrfs_root *root = BTRFS_I(inode)->root; 5952 - struct btrfs_trans_handle *trans; 5953 5906 struct btrfs_ordered_extent *ordered = NULL; 5954 - struct extent_state *cached_state = NULL; 5955 5907 u64 ordered_offset = dip->logical_offset; 5956 5908 u64 ordered_bytes = dip->bytes; 5957 5909 int ret; ··· 5959 5915 again: 5960 5916 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, 5961 5917 &ordered_offset, 5962 - ordered_bytes); 5918 + ordered_bytes, !err); 5963 5919 if (!ret) 5964 5920 goto out_test; 5965 5921 5966 - BUG_ON(!ordered); 5967 - 5968 - trans = btrfs_join_transaction(root); 5969 - if (IS_ERR(trans)) { 5970 - err = -ENOMEM; 5971 - goto out; 5972 - } 5973 - trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5974 - 5975 - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5976 - ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5977 - if (!ret) 5978 - err = btrfs_update_inode_fallback(trans, root, inode); 5979 - goto out; 5980 - } 5981 - 5982 - lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5983 - ordered->file_offset + ordered->len - 1, 0, 5984 - &cached_state); 5985 - 5986 - if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { 5987 - ret = btrfs_mark_extent_written(trans, inode, 5988 - ordered->file_offset, 5989 - ordered->file_offset + 5990 - ordered->len); 5991 - if (ret) { 5992 - err = ret; 5993 - goto out_unlock; 5994 - } 5995 - } else { 5996 - ret = insert_reserved_file_extent(trans, inode, 5997 - ordered->file_offset, 5998 - ordered->start, 5999 - ordered->disk_len, 6000 - ordered->len, 6001 - ordered->len, 6002 - 0, 0, 0, 6003 - BTRFS_FILE_EXTENT_REG); 
6004 - unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 6005 - ordered->file_offset, ordered->len); 6006 - if (ret) { 6007 - err = ret; 6008 - WARN_ON(1); 6009 - goto out_unlock; 6010 - } 6011 - } 6012 - 6013 - add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 6014 - ret = btrfs_ordered_update_i_size(inode, 0, ordered); 6015 - if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) 6016 - btrfs_update_inode_fallback(trans, root, inode); 6017 - ret = 0; 6018 - out_unlock: 6019 - unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 6020 - ordered->file_offset + ordered->len - 1, 6021 - &cached_state, GFP_NOFS); 6022 - out: 6023 - btrfs_delalloc_release_metadata(inode, ordered->len); 6024 - btrfs_end_transaction(trans, root); 6025 - ordered_offset = ordered->file_offset + ordered->len; 6026 - btrfs_put_ordered_extent(ordered); 6027 - btrfs_put_ordered_extent(ordered); 6028 - 5922 + ordered->work.func = finish_ordered_fn; 5923 + ordered->work.flags = 0; 5924 + btrfs_queue_worker(&root->fs_info->endio_write_workers, 5925 + &ordered->work); 6029 5926 out_test: 6030 5927 /* 6031 5928 * our bio might span multiple ordered extents. 
If we haven't ··· 5975 5990 if (ordered_offset < dip->logical_offset + dip->bytes) { 5976 5991 ordered_bytes = dip->logical_offset + dip->bytes - 5977 5992 ordered_offset; 5993 + ordered = NULL; 5978 5994 goto again; 5979 5995 } 5980 5996 out_done: 5981 5997 bio->bi_private = dip->private; 5982 5998 5983 - kfree(dip->csums); 5984 5999 kfree(dip); 5985 6000 5986 6001 /* If we had an error make sure to clear the uptodate flag */ ··· 6048 6063 int ret; 6049 6064 6050 6065 bio_get(bio); 6051 - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 6052 - if (ret) 6053 - goto err; 6066 + 6067 + if (!write) { 6068 + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 6069 + if (ret) 6070 + goto err; 6071 + } 6054 6072 6055 6073 if (skip_sum) 6056 6074 goto map; ··· 6473 6485 6474 6486 static void btrfs_invalidatepage(struct page *page, unsigned long offset) 6475 6487 { 6488 + struct inode *inode = page->mapping->host; 6476 6489 struct extent_io_tree *tree; 6477 6490 struct btrfs_ordered_extent *ordered; 6478 6491 struct extent_state *cached_state = NULL; 6479 6492 u64 page_start = page_offset(page); 6480 6493 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 6481 - 6482 6494 6483 6495 /* 6484 6496 * we have the page locked, so new writeback can't start, ··· 6489 6501 */ 6490 6502 wait_on_page_writeback(page); 6491 6503 6492 - tree = &BTRFS_I(page->mapping->host)->io_tree; 6504 + tree = &BTRFS_I(inode)->io_tree; 6493 6505 if (offset) { 6494 6506 btrfs_releasepage(page, GFP_NOFS); 6495 6507 return; 6496 6508 } 6497 6509 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 6498 - ordered = btrfs_lookup_ordered_extent(page->mapping->host, 6510 + ordered = btrfs_lookup_ordered_extent(inode, 6499 6511 page_offset(page)); 6500 6512 if (ordered) { 6501 6513 /* ··· 6510 6522 * whoever cleared the private bit is responsible 6511 6523 * for the finish_ordered_io 6512 6524 */ 6513 - if (TestClearPagePrivate2(page)) { 6514 - btrfs_finish_ordered_io(page->mapping->host, 6515 - 
page_start, page_end); 6525 + if (TestClearPagePrivate2(page) && 6526 + btrfs_dec_test_ordered_pending(inode, &ordered, page_start, 6527 + PAGE_CACHE_SIZE, 1)) { 6528 + btrfs_finish_ordered_io(ordered); 6516 6529 } 6517 6530 btrfs_put_ordered_extent(ordered); 6518 6531 cached_state = NULL; ··· 6760 6771 * using truncate to replace the contents of the file will 6761 6772 * end up with a zero length file after a crash. 6762 6773 */ 6763 - if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) 6774 + if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 6775 + &BTRFS_I(inode)->runtime_flags)) 6764 6776 btrfs_add_ordered_operation(trans, root, inode); 6765 6777 6766 6778 while (1) { ··· 6884 6894 ei->root = NULL; 6885 6895 ei->space_info = NULL; 6886 6896 ei->generation = 0; 6887 - ei->sequence = 0; 6888 6897 ei->last_trans = 0; 6889 6898 ei->last_sub_trans = 0; 6890 6899 ei->logged_trans = 0; ··· 6898 6909 ei->outstanding_extents = 0; 6899 6910 ei->reserved_extents = 0; 6900 6911 6901 - ei->ordered_data_close = 0; 6902 - ei->orphan_meta_reserved = 0; 6903 - ei->dummy_inode = 0; 6904 - ei->in_defrag = 0; 6905 - ei->delalloc_meta_reserved = 0; 6912 + ei->runtime_flags = 0; 6906 6913 ei->force_compress = BTRFS_COMPRESS_NONE; 6907 6914 6908 6915 ei->delayed_node = NULL; ··· 6912 6927 mutex_init(&ei->log_mutex); 6913 6928 mutex_init(&ei->delalloc_mutex); 6914 6929 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6915 - INIT_LIST_HEAD(&ei->i_orphan); 6916 6930 INIT_LIST_HEAD(&ei->delalloc_inodes); 6917 6931 INIT_LIST_HEAD(&ei->ordered_operations); 6918 6932 RB_CLEAR_NODE(&ei->rb_node); ··· 6956 6972 spin_unlock(&root->fs_info->ordered_extent_lock); 6957 6973 } 6958 6974 6959 - spin_lock(&root->orphan_lock); 6960 - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6975 + if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 6976 + &BTRFS_I(inode)->runtime_flags)) { 6961 6977 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", 6962 6978 (unsigned long 
long)btrfs_ino(inode)); 6963 - list_del_init(&BTRFS_I(inode)->i_orphan); 6979 + atomic_dec(&root->orphan_inodes); 6964 6980 } 6965 - spin_unlock(&root->orphan_lock); 6966 6981 6967 6982 while (1) { 6968 6983 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); ··· 7176 7193 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) 7177 7194 btrfs_add_ordered_operation(trans, root, old_inode); 7178 7195 7196 + inode_inc_iversion(old_dir); 7197 + inode_inc_iversion(new_dir); 7198 + inode_inc_iversion(old_inode); 7179 7199 old_dir->i_ctime = old_dir->i_mtime = ctime; 7180 7200 new_dir->i_ctime = new_dir->i_mtime = ctime; 7181 7201 old_inode->i_ctime = ctime; ··· 7205 7219 } 7206 7220 7207 7221 if (new_inode) { 7222 + inode_inc_iversion(new_inode); 7208 7223 new_inode->i_ctime = CURRENT_TIME; 7209 7224 if (unlikely(btrfs_ino(new_inode) == 7210 7225 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ··· 7477 7490 cur_offset += ins.offset; 7478 7491 *alloc_hint = ins.objectid + ins.offset; 7479 7492 7493 + inode_inc_iversion(inode); 7480 7494 inode->i_ctime = CURRENT_TIME; 7481 7495 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 7482 7496 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+43 -7
fs/btrfs/ioctl.c
··· 261 261 } 262 262 263 263 btrfs_update_iflags(inode); 264 + inode_inc_iversion(inode); 264 265 inode->i_ctime = CURRENT_TIME; 265 266 ret = btrfs_update_inode(trans, root, inode); 266 267 ··· 368 367 return PTR_ERR(trans); 369 368 370 369 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 371 - 0, objectid, NULL, 0, 0, 0, 0); 370 + 0, objectid, NULL, 0, 0, 0); 372 371 if (IS_ERR(leaf)) { 373 372 ret = PTR_ERR(leaf); 374 373 goto fail; ··· 2263 2262 di_args->bytes_used = dev->bytes_used; 2264 2263 di_args->total_bytes = dev->total_bytes; 2265 2264 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); 2266 - if (dev->name) 2265 + if (dev->name) { 2267 2266 strncpy(di_args->path, dev->name, sizeof(di_args->path)); 2268 - else 2267 + di_args->path[sizeof(di_args->path) - 1] = 0; 2268 + } else { 2269 2269 di_args->path[0] = '\0'; 2270 + } 2270 2271 2271 2272 out: 2272 2273 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) ··· 2625 2622 btrfs_mark_buffer_dirty(leaf); 2626 2623 btrfs_release_path(path); 2627 2624 2625 + inode_inc_iversion(inode); 2628 2626 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2629 2627 2630 2628 /* ··· 2918 2914 up_read(&info->groups_sem); 2919 2915 } 2920 2916 2921 - user_dest = (struct btrfs_ioctl_space_info *) 2917 + user_dest = (struct btrfs_ioctl_space_info __user *) 2922 2918 (arg + sizeof(struct btrfs_ioctl_space_args)); 2923 2919 2924 2920 if (copy_to_user(user_dest, dest_orig, alloc_size)) ··· 3038 3034 return PTR_ERR(sa); 3039 3035 3040 3036 ret = btrfs_scrub_progress(root, sa->devid, &sa->progress); 3037 + 3038 + if (copy_to_user(arg, sa, sizeof(*sa))) 3039 + ret = -EFAULT; 3040 + 3041 + kfree(sa); 3042 + return ret; 3043 + } 3044 + 3045 + static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, 3046 + void __user *arg, int reset_after_read) 3047 + { 3048 + struct btrfs_ioctl_get_dev_stats *sa; 3049 + int ret; 3050 + 3051 + if (reset_after_read && !capable(CAP_SYS_ADMIN)) 3052 + return -EPERM; 3053 + 
3054 + sa = memdup_user(arg, sizeof(*sa)); 3055 + if (IS_ERR(sa)) 3056 + return PTR_ERR(sa); 3057 + 3058 + ret = btrfs_get_dev_stats(root, sa, reset_after_read); 3041 3059 3042 3060 if (copy_to_user(arg, sa, sizeof(*sa))) 3043 3061 ret = -EFAULT; ··· 3238 3212 } 3239 3213 } 3240 3214 3241 - static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) 3215 + static long btrfs_ioctl_balance(struct file *file, void __user *arg) 3242 3216 { 3217 + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3243 3218 struct btrfs_fs_info *fs_info = root->fs_info; 3244 3219 struct btrfs_ioctl_balance_args *bargs; 3245 3220 struct btrfs_balance_control *bctl; ··· 3251 3224 3252 3225 if (fs_info->sb->s_flags & MS_RDONLY) 3253 3226 return -EROFS; 3227 + 3228 + ret = mnt_want_write(file->f_path.mnt); 3229 + if (ret) 3230 + return ret; 3254 3231 3255 3232 mutex_lock(&fs_info->volume_mutex); 3256 3233 mutex_lock(&fs_info->balance_mutex); ··· 3322 3291 out: 3323 3292 mutex_unlock(&fs_info->balance_mutex); 3324 3293 mutex_unlock(&fs_info->volume_mutex); 3294 + mnt_drop_write(file->f_path.mnt); 3325 3295 return ret; 3326 3296 } 3327 3297 ··· 3418 3386 case BTRFS_IOC_DEV_INFO: 3419 3387 return btrfs_ioctl_dev_info(root, argp); 3420 3388 case BTRFS_IOC_BALANCE: 3421 - return btrfs_ioctl_balance(root, NULL); 3389 + return btrfs_ioctl_balance(file, NULL); 3422 3390 case BTRFS_IOC_CLONE: 3423 3391 return btrfs_ioctl_clone(file, arg, 0, 0, 0); 3424 3392 case BTRFS_IOC_CLONE_RANGE: ··· 3451 3419 case BTRFS_IOC_SCRUB_PROGRESS: 3452 3420 return btrfs_ioctl_scrub_progress(root, argp); 3453 3421 case BTRFS_IOC_BALANCE_V2: 3454 - return btrfs_ioctl_balance(root, argp); 3422 + return btrfs_ioctl_balance(file, argp); 3455 3423 case BTRFS_IOC_BALANCE_CTL: 3456 3424 return btrfs_ioctl_balance_ctl(root, arg); 3457 3425 case BTRFS_IOC_BALANCE_PROGRESS: 3458 3426 return btrfs_ioctl_balance_progress(root, argp); 3427 + case BTRFS_IOC_GET_DEV_STATS: 3428 + return 
btrfs_ioctl_get_dev_stats(root, argp, 0); 3429 + case BTRFS_IOC_GET_AND_RESET_DEV_STATS: 3430 + return btrfs_ioctl_get_dev_stats(root, argp, 1); 3459 3431 } 3460 3432 3461 3433 return -ENOTTY;
+33
fs/btrfs/ioctl.h
··· 266 266 __u64 inodes; 267 267 }; 268 268 269 + enum btrfs_dev_stat_values { 270 + /* disk I/O failure stats */ 271 + BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ 272 + BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ 273 + BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ 274 + 275 + /* stats for indirect indications for I/O failures */ 276 + BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or 277 + * contents is illegal: this is an 278 + * indication that the block was damaged 279 + * during read or write, or written to 280 + * wrong location or read from wrong 281 + * location */ 282 + BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not 283 + * been written */ 284 + 285 + BTRFS_DEV_STAT_VALUES_MAX 286 + }; 287 + 288 + struct btrfs_ioctl_get_dev_stats { 289 + __u64 devid; /* in */ 290 + __u64 nr_items; /* in/out */ 291 + 292 + /* out values: */ 293 + __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; 294 + 295 + __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ 296 + }; 297 + 269 298 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 270 299 struct btrfs_ioctl_vol_args) 271 300 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ ··· 359 330 struct btrfs_ioctl_ino_path_args) 360 331 #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 361 332 struct btrfs_ioctl_ino_path_args) 333 + #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 334 + struct btrfs_ioctl_get_dev_stats) 335 + #define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \ 336 + struct btrfs_ioctl_get_dev_stats) 362 337 363 338 #endif
+77 -88
fs/btrfs/ordered-data.c
··· 196 196 entry->len = len; 197 197 entry->disk_len = disk_len; 198 198 entry->bytes_left = len; 199 - entry->inode = inode; 199 + entry->inode = igrab(inode); 200 200 entry->compress_type = compress_type; 201 201 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 202 202 set_bit(type, &entry->flags); ··· 212 212 213 213 trace_btrfs_ordered_extent_add(inode, entry); 214 214 215 - spin_lock(&tree->lock); 215 + spin_lock_irq(&tree->lock); 216 216 node = tree_insert(&tree->tree, file_offset, 217 217 &entry->rb_node); 218 218 if (node) 219 219 ordered_data_tree_panic(inode, -EEXIST, file_offset); 220 - spin_unlock(&tree->lock); 220 + spin_unlock_irq(&tree->lock); 221 221 222 222 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 223 223 list_add_tail(&entry->root_extent_list, ··· 264 264 struct btrfs_ordered_inode_tree *tree; 265 265 266 266 tree = &BTRFS_I(inode)->ordered_tree; 267 - spin_lock(&tree->lock); 267 + spin_lock_irq(&tree->lock); 268 268 list_add_tail(&sum->list, &entry->list); 269 - spin_unlock(&tree->lock); 269 + spin_unlock_irq(&tree->lock); 270 270 } 271 271 272 272 /* ··· 283 283 */ 284 284 int btrfs_dec_test_first_ordered_pending(struct inode *inode, 285 285 struct btrfs_ordered_extent **cached, 286 - u64 *file_offset, u64 io_size) 286 + u64 *file_offset, u64 io_size, int uptodate) 287 287 { 288 288 struct btrfs_ordered_inode_tree *tree; 289 289 struct rb_node *node; 290 290 struct btrfs_ordered_extent *entry = NULL; 291 291 int ret; 292 + unsigned long flags; 292 293 u64 dec_end; 293 294 u64 dec_start; 294 295 u64 to_dec; 295 296 296 297 tree = &BTRFS_I(inode)->ordered_tree; 297 - spin_lock(&tree->lock); 298 + spin_lock_irqsave(&tree->lock, flags); 298 299 node = tree_search(tree, *file_offset); 299 300 if (!node) { 300 301 ret = 1; ··· 324 323 (unsigned long long)to_dec); 325 324 } 326 325 entry->bytes_left -= to_dec; 326 + if (!uptodate) 327 + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 328 + 327 329 if 
(entry->bytes_left == 0) 328 330 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 329 331 else ··· 336 332 *cached = entry; 337 333 atomic_inc(&entry->refs); 338 334 } 339 - spin_unlock(&tree->lock); 335 + spin_unlock_irqrestore(&tree->lock, flags); 340 336 return ret == 0; 341 337 } 342 338 ··· 351 347 */ 352 348 int btrfs_dec_test_ordered_pending(struct inode *inode, 353 349 struct btrfs_ordered_extent **cached, 354 - u64 file_offset, u64 io_size) 350 + u64 file_offset, u64 io_size, int uptodate) 355 351 { 356 352 struct btrfs_ordered_inode_tree *tree; 357 353 struct rb_node *node; 358 354 struct btrfs_ordered_extent *entry = NULL; 355 + unsigned long flags; 359 356 int ret; 360 357 361 358 tree = &BTRFS_I(inode)->ordered_tree; 362 - spin_lock(&tree->lock); 359 + spin_lock_irqsave(&tree->lock, flags); 360 + if (cached && *cached) { 361 + entry = *cached; 362 + goto have_entry; 363 + } 364 + 363 365 node = tree_search(tree, file_offset); 364 366 if (!node) { 365 367 ret = 1; ··· 373 363 } 374 364 375 365 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 366 + have_entry: 376 367 if (!offset_in_entry(entry, file_offset)) { 377 368 ret = 1; 378 369 goto out; ··· 385 374 (unsigned long long)io_size); 386 375 } 387 376 entry->bytes_left -= io_size; 377 + if (!uptodate) 378 + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 379 + 388 380 if (entry->bytes_left == 0) 389 381 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 390 382 else ··· 397 383 *cached = entry; 398 384 atomic_inc(&entry->refs); 399 385 } 400 - spin_unlock(&tree->lock); 386 + spin_unlock_irqrestore(&tree->lock, flags); 401 387 return ret == 0; 402 388 } 403 389 ··· 413 399 trace_btrfs_ordered_extent_put(entry->inode, entry); 414 400 415 401 if (atomic_dec_and_test(&entry->refs)) { 402 + if (entry->inode) 403 + btrfs_add_delayed_iput(entry->inode); 416 404 while (!list_empty(&entry->list)) { 417 405 cur = entry->list.next; 418 406 sum = list_entry(cur, struct 
btrfs_ordered_sum, list); ··· 427 411 428 412 /* 429 413 * remove an ordered extent from the tree. No references are dropped 430 - * and you must wake_up entry->wait. You must hold the tree lock 431 - * while you call this function. 414 + * and waiters are woken up. 432 415 */ 433 - static void __btrfs_remove_ordered_extent(struct inode *inode, 434 - struct btrfs_ordered_extent *entry) 416 + void btrfs_remove_ordered_extent(struct inode *inode, 417 + struct btrfs_ordered_extent *entry) 435 418 { 436 419 struct btrfs_ordered_inode_tree *tree; 437 420 struct btrfs_root *root = BTRFS_I(inode)->root; 438 421 struct rb_node *node; 439 422 440 423 tree = &BTRFS_I(inode)->ordered_tree; 424 + spin_lock_irq(&tree->lock); 441 425 node = &entry->rb_node; 442 426 rb_erase(node, &tree->tree); 443 427 tree->last = NULL; 444 428 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 429 + spin_unlock_irq(&tree->lock); 445 430 446 431 spin_lock(&root->fs_info->ordered_extent_lock); 447 432 list_del_init(&entry->root_extent_list); ··· 459 442 list_del_init(&BTRFS_I(inode)->ordered_operations); 460 443 } 461 444 spin_unlock(&root->fs_info->ordered_extent_lock); 462 - } 463 - 464 - /* 465 - * remove an ordered extent from the tree. No references are dropped 466 - * but any waiters are woken. 
467 - */ 468 - void btrfs_remove_ordered_extent(struct inode *inode, 469 - struct btrfs_ordered_extent *entry) 470 - { 471 - struct btrfs_ordered_inode_tree *tree; 472 - 473 - tree = &BTRFS_I(inode)->ordered_tree; 474 - spin_lock(&tree->lock); 475 - __btrfs_remove_ordered_extent(inode, entry); 476 - spin_unlock(&tree->lock); 477 445 wake_up(&entry->wait); 478 446 } 479 447 ··· 623 621 if (orig_end > INT_LIMIT(loff_t)) 624 622 orig_end = INT_LIMIT(loff_t); 625 623 } 626 - again: 624 + 627 625 /* start IO across the range first to instantiate any delalloc 628 626 * extents 629 627 */ 630 - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); 631 - 632 - /* The compression code will leave pages locked but return from 633 - * writepage without setting the page writeback. Starting again 634 - * with WB_SYNC_ALL will end up waiting for the IO to actually start. 635 - */ 636 - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); 637 - 638 - filemap_fdatawait_range(inode->i_mapping, start, orig_end); 628 + filemap_write_and_wait_range(inode->i_mapping, start, orig_end); 639 629 640 630 end = orig_end; 641 631 found = 0; ··· 651 657 break; 652 658 end--; 653 659 } 654 - if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, 655 - EXTENT_DELALLOC, 0, NULL)) { 656 - schedule_timeout(1); 657 - goto again; 658 - } 659 660 } 660 661 661 662 /* ··· 665 676 struct btrfs_ordered_extent *entry = NULL; 666 677 667 678 tree = &BTRFS_I(inode)->ordered_tree; 668 - spin_lock(&tree->lock); 679 + spin_lock_irq(&tree->lock); 669 680 node = tree_search(tree, file_offset); 670 681 if (!node) 671 682 goto out; ··· 676 687 if (entry) 677 688 atomic_inc(&entry->refs); 678 689 out: 679 - spin_unlock(&tree->lock); 690 + spin_unlock_irq(&tree->lock); 680 691 return entry; 681 692 } 682 693 ··· 692 703 struct btrfs_ordered_extent *entry = NULL; 693 704 694 705 tree = &BTRFS_I(inode)->ordered_tree; 695 - spin_lock(&tree->lock); 706 + spin_lock_irq(&tree->lock); 696 707 
node = tree_search(tree, file_offset); 697 708 if (!node) { 698 709 node = tree_search(tree, file_offset + len); ··· 717 728 out: 718 729 if (entry) 719 730 atomic_inc(&entry->refs); 720 - spin_unlock(&tree->lock); 731 + spin_unlock_irq(&tree->lock); 721 732 return entry; 722 733 } 723 734 ··· 733 744 struct btrfs_ordered_extent *entry = NULL; 734 745 735 746 tree = &BTRFS_I(inode)->ordered_tree; 736 - spin_lock(&tree->lock); 747 + spin_lock_irq(&tree->lock); 737 748 node = tree_search(tree, file_offset); 738 749 if (!node) 739 750 goto out; ··· 741 752 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 742 753 atomic_inc(&entry->refs); 743 754 out: 744 - spin_unlock(&tree->lock); 755 + spin_unlock_irq(&tree->lock); 745 756 return entry; 746 757 } 747 758 ··· 753 764 struct btrfs_ordered_extent *ordered) 754 765 { 755 766 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 756 - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 757 767 u64 disk_i_size; 758 768 u64 new_i_size; 759 769 u64 i_size_test; ··· 767 779 else 768 780 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); 769 781 770 - spin_lock(&tree->lock); 782 + spin_lock_irq(&tree->lock); 771 783 disk_i_size = BTRFS_I(inode)->disk_i_size; 772 784 773 785 /* truncate file */ ··· 785 797 goto out; 786 798 } 787 799 788 - /* 789 - * we can't update the disk_isize if there are delalloc bytes 790 - * between disk_i_size and this ordered extent 791 - */ 792 - if (test_range_bit(io_tree, disk_i_size, offset - 1, 793 - EXTENT_DELALLOC, 0, NULL)) { 794 - goto out; 795 - } 796 800 /* 797 801 * walk backward from this ordered extent to disk_i_size. 
798 802 * if we find an ordered extent then we can't update disk i_size ··· 805 825 } 806 826 node = prev; 807 827 } 808 - while (node) { 828 + for (; node; node = rb_prev(node)) { 809 829 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 830 + 831 + /* We treat this entry as if it doesnt exist */ 832 + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) 833 + continue; 810 834 if (test->file_offset + test->len <= disk_i_size) 811 835 break; 812 836 if (test->file_offset >= i_size) 813 837 break; 814 838 if (test->file_offset >= disk_i_size) 815 839 goto out; 816 - node = rb_prev(node); 817 840 } 818 841 new_i_size = min_t(u64, offset, i_size); 819 842 ··· 834 851 else 835 852 node = rb_first(&tree->tree); 836 853 } 837 - i_size_test = 0; 838 - if (node) { 839 - /* 840 - * do we have an area where IO might have finished 841 - * between our ordered extent and the next one. 842 - */ 854 + 855 + /* 856 + * We are looking for an area between our current extent and the next 857 + * ordered extent to update the i_size to. There are 3 cases here 858 + * 859 + * 1) We don't actually have anything and we can update to i_size. 860 + * 2) We have stuff but they already did their i_size update so again we 861 + * can just update to i_size. 862 + * 3) We have an outstanding ordered extent so the most we can update 863 + * our disk_i_size to is the start of the next offset. 864 + */ 865 + i_size_test = i_size; 866 + for (; node; node = rb_next(node)) { 843 867 test = rb_entry(node, struct btrfs_ordered_extent, rb_node); 844 - if (test->file_offset > offset) 868 + 869 + if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags)) 870 + continue; 871 + if (test->file_offset > offset) { 845 872 i_size_test = test->file_offset; 846 - } else { 847 - i_size_test = i_size; 873 + break; 874 + } 848 875 } 849 876 850 877 /* 851 878 * i_size_test is the end of a region after this ordered 852 - * extent where there are no ordered extents. 
As long as there 853 - * are no delalloc bytes in this area, it is safe to update 854 - * disk_i_size to the end of the region. 879 + * extent where there are no ordered extents, we can safely set 880 + * disk_i_size to this. 855 881 */ 856 - if (i_size_test > offset && 857 - !test_range_bit(io_tree, offset, i_size_test - 1, 858 - EXTENT_DELALLOC, 0, NULL)) { 882 + if (i_size_test > offset) 859 883 new_i_size = min_t(u64, i_size_test, i_size); 860 - } 861 884 BTRFS_I(inode)->disk_i_size = new_i_size; 862 885 ret = 0; 863 886 out: 864 887 /* 865 - * we need to remove the ordered extent with the tree lock held 866 - * so that other people calling this function don't find our fully 867 - * processed ordered entry and skip updating the i_size 888 + * We need to do this because we can't remove ordered extents until 889 + * after the i_disk_size has been updated and then the inode has been 890 + * updated to reflect the change, so we need to tell anybody who finds 891 + * this ordered extent that we've already done all the real work, we 892 + * just haven't completed all the other work. 868 893 */ 869 894 if (ordered) 870 - __btrfs_remove_ordered_extent(inode, ordered); 871 - spin_unlock(&tree->lock); 872 - if (ordered) 873 - wake_up(&ordered->wait); 895 + set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags); 896 + spin_unlock_irq(&tree->lock); 874 897 return ret; 875 898 } 876 899 ··· 901 912 if (!ordered) 902 913 return 1; 903 914 904 - spin_lock(&tree->lock); 915 + spin_lock_irq(&tree->lock); 905 916 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 906 917 if (disk_bytenr >= ordered_sum->bytenr) { 907 918 num_sectors = ordered_sum->len / sectorsize; ··· 916 927 } 917 928 } 918 929 out: 919 - spin_unlock(&tree->lock); 930 + spin_unlock_irq(&tree->lock); 920 931 btrfs_put_ordered_extent(ordered); 921 932 return ret; 922 933 }
+11 -2
fs/btrfs/ordered-data.h
··· 74 74 75 75 #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ 76 76 77 + #define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ 78 + 79 + #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent 80 + * has done its due diligence in updating 81 + * the isize. */ 82 + 77 83 struct btrfs_ordered_extent { 78 84 /* logical offset in the file */ 79 85 u64 file_offset; ··· 119 113 120 114 /* a per root list of all the pending ordered extents */ 121 115 struct list_head root_extent_list; 116 + 117 + struct btrfs_work work; 122 118 }; 123 119 124 120 ··· 151 143 struct btrfs_ordered_extent *entry); 152 144 int btrfs_dec_test_ordered_pending(struct inode *inode, 153 145 struct btrfs_ordered_extent **cached, 154 - u64 file_offset, u64 io_size); 146 + u64 file_offset, u64 io_size, int uptodate); 155 147 int btrfs_dec_test_first_ordered_pending(struct inode *inode, 156 148 struct btrfs_ordered_extent **cached, 157 - u64 *file_offset, u64 io_size); 149 + u64 *file_offset, u64 io_size, 150 + int uptodate); 158 151 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 159 152 u64 start, u64 len, u64 disk_len, int type); 160 153 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+3
fs/btrfs/print-tree.c
··· 294 294 btrfs_dev_extent_chunk_offset(l, dev_extent), 295 295 (unsigned long long) 296 296 btrfs_dev_extent_length(l, dev_extent)); 297 + case BTRFS_DEV_STATS_KEY: 298 + printk(KERN_INFO "\t\tdevice stats\n"); 299 + break; 297 300 }; 298 301 } 299 302 }
+5
fs/btrfs/reada.c
··· 718 718 { 719 719 struct reada_machine_work *rmw; 720 720 struct btrfs_fs_info *fs_info; 721 + int old_ioprio; 721 722 722 723 rmw = container_of(work, struct reada_machine_work, work); 723 724 fs_info = rmw->fs_info; 724 725 725 726 kfree(rmw); 726 727 728 + old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current), 729 + task_nice_ioprio(current)); 730 + set_task_ioprio(current, BTRFS_IOPRIO_READA); 727 731 __reada_start_machine(fs_info); 732 + set_task_ioprio(current, old_ioprio); 728 733 } 729 734 730 735 static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+49 -16
fs/btrfs/scrub.c
··· 50 50 struct scrub_page { 51 51 struct scrub_block *sblock; 52 52 struct page *page; 53 - struct block_device *bdev; 53 + struct btrfs_device *dev; 54 54 u64 flags; /* extent flags */ 55 55 u64 generation; 56 56 u64 logical; ··· 86 86 unsigned int header_error:1; 87 87 unsigned int checksum_error:1; 88 88 unsigned int no_io_error_seen:1; 89 + unsigned int generation_error:1; /* also sets header_error */ 89 90 }; 90 91 }; 91 92 ··· 676 675 sdev->stat.read_errors++; 677 676 sdev->stat.uncorrectable_errors++; 678 677 spin_unlock(&sdev->stat_lock); 678 + btrfs_dev_stat_inc_and_print(sdev->dev, 679 + BTRFS_DEV_STAT_READ_ERRS); 679 680 goto out; 680 681 } 681 682 ··· 689 686 sdev->stat.read_errors++; 690 687 sdev->stat.uncorrectable_errors++; 691 688 spin_unlock(&sdev->stat_lock); 689 + btrfs_dev_stat_inc_and_print(sdev->dev, 690 + BTRFS_DEV_STAT_READ_ERRS); 692 691 goto out; 693 692 } 694 693 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); ··· 704 699 sdev->stat.read_errors++; 705 700 sdev->stat.uncorrectable_errors++; 706 701 spin_unlock(&sdev->stat_lock); 702 + btrfs_dev_stat_inc_and_print(sdev->dev, 703 + BTRFS_DEV_STAT_READ_ERRS); 707 704 goto out; 708 705 } 709 706 ··· 732 725 spin_unlock(&sdev->stat_lock); 733 726 if (__ratelimit(&_rs)) 734 727 scrub_print_warning("i/o error", sblock_to_check); 728 + btrfs_dev_stat_inc_and_print(sdev->dev, 729 + BTRFS_DEV_STAT_READ_ERRS); 735 730 } else if (sblock_bad->checksum_error) { 736 731 spin_lock(&sdev->stat_lock); 737 732 sdev->stat.csum_errors++; 738 733 spin_unlock(&sdev->stat_lock); 739 734 if (__ratelimit(&_rs)) 740 735 scrub_print_warning("checksum error", sblock_to_check); 736 + btrfs_dev_stat_inc_and_print(sdev->dev, 737 + BTRFS_DEV_STAT_CORRUPTION_ERRS); 741 738 } else if (sblock_bad->header_error) { 742 739 spin_lock(&sdev->stat_lock); 743 740 sdev->stat.verify_errors++; ··· 749 738 if (__ratelimit(&_rs)) 750 739 scrub_print_warning("checksum/header error", 751 740 sblock_to_check); 741 + if 
(sblock_bad->generation_error) 742 + btrfs_dev_stat_inc_and_print(sdev->dev, 743 + BTRFS_DEV_STAT_GENERATION_ERRS); 744 + else 745 + btrfs_dev_stat_inc_and_print(sdev->dev, 746 + BTRFS_DEV_STAT_CORRUPTION_ERRS); 752 747 } 753 748 754 749 if (sdev->readonly) ··· 1015 998 page = sblock->pagev + page_index; 1016 999 page->logical = logical; 1017 1000 page->physical = bbio->stripes[mirror_index].physical; 1018 - /* for missing devices, bdev is NULL */ 1019 - page->bdev = bbio->stripes[mirror_index].dev->bdev; 1001 + /* for missing devices, dev->bdev is NULL */ 1002 + page->dev = bbio->stripes[mirror_index].dev; 1020 1003 page->mirror_num = mirror_index + 1; 1021 1004 page->page = alloc_page(GFP_NOFS); 1022 1005 if (!page->page) { ··· 1060 1043 struct scrub_page *page = sblock->pagev + page_num; 1061 1044 DECLARE_COMPLETION_ONSTACK(complete); 1062 1045 1063 - if (page->bdev == NULL) { 1046 + if (page->dev->bdev == NULL) { 1064 1047 page->io_error = 1; 1065 1048 sblock->no_io_error_seen = 0; 1066 1049 continue; ··· 1070 1053 bio = bio_alloc(GFP_NOFS, 1); 1071 1054 if (!bio) 1072 1055 return -EIO; 1073 - bio->bi_bdev = page->bdev; 1056 + bio->bi_bdev = page->dev->bdev; 1074 1057 bio->bi_sector = page->physical >> 9; 1075 1058 bio->bi_end_io = scrub_complete_bio_end_io; 1076 1059 bio->bi_private = &complete; ··· 1119 1102 h = (struct btrfs_header *)mapped_buffer; 1120 1103 1121 1104 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1122 - generation != le64_to_cpu(h->generation) || 1123 1105 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1124 1106 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1125 - BTRFS_UUID_SIZE)) 1107 + BTRFS_UUID_SIZE)) { 1126 1108 sblock->header_error = 1; 1109 + } else if (generation != le64_to_cpu(h->generation)) { 1110 + sblock->header_error = 1; 1111 + sblock->generation_error = 1; 1112 + } 1127 1113 csum = h->csum; 1128 1114 } else { 1129 1115 if (!have_csum) ··· 1202 1182 bio = bio_alloc(GFP_NOFS, 1); 1203 1183 if (!bio) 1204 
1184 return -EIO; 1205 - bio->bi_bdev = page_bad->bdev; 1185 + bio->bi_bdev = page_bad->dev->bdev; 1206 1186 bio->bi_sector = page_bad->physical >> 9; 1207 1187 bio->bi_end_io = scrub_complete_bio_end_io; 1208 1188 bio->bi_private = &complete; ··· 1216 1196 1217 1197 /* this will also unplug the queue */ 1218 1198 wait_for_completion(&complete); 1199 + if (!bio_flagged(bio, BIO_UPTODATE)) { 1200 + btrfs_dev_stat_inc_and_print(page_bad->dev, 1201 + BTRFS_DEV_STAT_WRITE_ERRS); 1202 + bio_put(bio); 1203 + return -EIO; 1204 + } 1219 1205 bio_put(bio); 1220 1206 } 1221 1207 ··· 1378 1352 u64 mapped_size; 1379 1353 void *p; 1380 1354 u32 crc = ~(u32)0; 1381 - int fail = 0; 1355 + int fail_gen = 0; 1356 + int fail_cor = 0; 1382 1357 u64 len; 1383 1358 int index; 1384 1359 ··· 1390 1363 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1391 1364 1392 1365 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1393 - ++fail; 1366 + ++fail_cor; 1394 1367 1395 1368 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1396 - ++fail; 1369 + ++fail_gen; 1397 1370 1398 1371 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1399 - ++fail; 1372 + ++fail_cor; 1400 1373 1401 1374 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 1402 1375 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; ··· 1421 1394 1422 1395 btrfs_csum_final(crc, calculated_csum); 1423 1396 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1424 - ++fail; 1397 + ++fail_cor; 1425 1398 1426 - if (fail) { 1399 + if (fail_cor + fail_gen) { 1427 1400 /* 1428 1401 * if we find an error in a super block, we just report it. 
1429 1402 * They will get written with the next transaction commit ··· 1432 1405 spin_lock(&sdev->stat_lock); 1433 1406 ++sdev->stat.super_errors; 1434 1407 spin_unlock(&sdev->stat_lock); 1408 + if (fail_cor) 1409 + btrfs_dev_stat_inc_and_print(sdev->dev, 1410 + BTRFS_DEV_STAT_CORRUPTION_ERRS); 1411 + else 1412 + btrfs_dev_stat_inc_and_print(sdev->dev, 1413 + BTRFS_DEV_STAT_GENERATION_ERRS); 1435 1414 } 1436 1415 1437 - return fail; 1416 + return fail_cor + fail_gen; 1438 1417 } 1439 1418 1440 1419 static void scrub_block_get(struct scrub_block *sblock) ··· 1584 1551 return -ENOMEM; 1585 1552 } 1586 1553 spage->sblock = sblock; 1587 - spage->bdev = sdev->dev->bdev; 1554 + spage->dev = sdev->dev; 1588 1555 spage->flags = flags; 1589 1556 spage->generation = gen; 1590 1557 spage->logical = logical;
+69 -48
fs/btrfs/super.c
··· 188 188 va_start(args, fmt); 189 189 190 190 if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { 191 - strncpy(lvl, fmt, 3); 191 + memcpy(lvl, fmt, 3); 192 + lvl[3] = '\0'; 192 193 fmt += 3; 193 194 type = logtypes[fmt[1] - '0']; 194 195 } else ··· 436 435 case Opt_thread_pool: 437 436 intarg = 0; 438 437 match_int(&args[0], &intarg); 439 - if (intarg) { 438 + if (intarg) 440 439 info->thread_pool_size = intarg; 441 - printk(KERN_INFO "btrfs: thread pool %d\n", 442 - info->thread_pool_size); 443 - } 444 440 break; 445 441 case Opt_max_inline: 446 442 num = match_strdup(&args[0]); ··· 767 769 #ifdef CONFIG_BTRFS_FS_POSIX_ACL 768 770 sb->s_flags |= MS_POSIXACL; 769 771 #endif 770 - 772 + sb->s_flags |= MS_I_VERSION; 771 773 err = open_ctree(sb, fs_devices, (char *)data); 772 774 if (err) { 773 775 printk("btrfs: open_ctree failed\n"); ··· 923 925 */ 924 926 static char *setup_root_args(char *args) 925 927 { 926 - unsigned copied = 0; 927 - unsigned len = strlen(args) + 2; 928 - char *pos; 929 - char *ret; 928 + unsigned len = strlen(args) + 2 + 1; 929 + char *src, *dst, *buf; 930 930 931 931 /* 932 - * We need the same args as before, but minus 932 + * We need the same args as before, but with this substitution: 933 + * s!subvol=[^,]+!subvolid=0! 933 934 * 934 - * subvol=a 935 - * 936 - * and add 937 - * 938 - * subvolid=0 939 - * 940 - * which is a difference of 2 characters, so we allocate strlen(args) + 941 - * 2 characters. 935 + * Since the replacement string is up to 2 bytes longer than the 936 + * original, allocate strlen(args) + 2 + 1 bytes. 942 937 */ 943 - ret = kzalloc(len * sizeof(char), GFP_NOFS); 944 - if (!ret) 945 - return NULL; 946 - pos = strstr(args, "subvol="); 947 938 939 + src = strstr(args, "subvol="); 948 940 /* This shouldn't happen, but just in case.. 
*/ 949 - if (!pos) { 950 - kfree(ret); 941 + if (!src) 951 942 return NULL; 952 - } 943 + 944 + buf = dst = kmalloc(len, GFP_NOFS); 945 + if (!buf) 946 + return NULL; 953 947 954 948 /* 955 - * The subvol=<> arg is not at the front of the string, copy everybody 956 - * up to that into ret. 949 + * If the subvol= arg is not at the start of the string, 950 + * copy whatever precedes it into buf. 957 951 */ 958 - if (pos != args) { 959 - *pos = '\0'; 960 - strcpy(ret, args); 961 - copied += strlen(args); 962 - pos++; 952 + if (src != args) { 953 + *src++ = '\0'; 954 + strcpy(buf, args); 955 + dst += strlen(args); 963 956 } 964 957 965 - strncpy(ret + copied, "subvolid=0", len - copied); 966 - 967 - /* Length of subvolid=0 */ 968 - copied += 10; 958 + strcpy(dst, "subvolid=0"); 959 + dst += strlen("subvolid=0"); 969 960 970 961 /* 971 - * If there is no , after the subvol= option then we know there's no 972 - * other options and we can just return. 962 + * If there is a "," after the original subvol=... string, 963 + * copy that suffix into our buffer. Otherwise, we're done. 
973 964 */ 974 - pos = strchr(pos, ','); 975 - if (!pos) 976 - return ret; 965 + src = strchr(src, ','); 966 + if (src) 967 + strcpy(dst, src); 977 968 978 - /* Copy the rest of the arguments into our buffer */ 979 - strncpy(ret + copied, pos, len - copied); 980 - copied += strlen(pos); 981 - 982 - return ret; 969 + return buf; 983 970 } 984 971 985 972 static struct dentry *mount_subvol(const char *subvol_name, int flags, ··· 1101 1118 return ERR_PTR(error); 1102 1119 } 1103 1120 1121 + static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit) 1122 + { 1123 + spin_lock_irq(&workers->lock); 1124 + workers->max_workers = new_limit; 1125 + spin_unlock_irq(&workers->lock); 1126 + } 1127 + 1128 + static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, 1129 + int new_pool_size, int old_pool_size) 1130 + { 1131 + if (new_pool_size == old_pool_size) 1132 + return; 1133 + 1134 + fs_info->thread_pool_size = new_pool_size; 1135 + 1136 + printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n", 1137 + old_pool_size, new_pool_size); 1138 + 1139 + btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); 1140 + btrfs_set_max_workers(&fs_info->workers, new_pool_size); 1141 + btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); 1142 + btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); 1143 + btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); 1144 + btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); 1145 + btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); 1146 + btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); 1147 + btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); 1148 + btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); 1149 + btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1150 + btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1151 + 
btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1152 + btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); 1153 + } 1154 + 1104 1155 static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1105 1156 { 1106 1157 struct btrfs_fs_info *fs_info = btrfs_sb(sb); ··· 1153 1136 ret = -EINVAL; 1154 1137 goto restore; 1155 1138 } 1139 + 1140 + btrfs_resize_thread_pool(fs_info, 1141 + fs_info->thread_pool_size, old_thread_pool_size); 1156 1142 1157 1143 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1158 1144 return 0; ··· 1200 1180 fs_info->compress_type = old_compress_type; 1201 1181 fs_info->max_inline = old_max_inline; 1202 1182 fs_info->alloc_start = old_alloc_start; 1203 - fs_info->thread_pool_size = old_thread_pool_size; 1183 + btrfs_resize_thread_pool(fs_info, 1184 + old_thread_pool_size, fs_info->thread_pool_size); 1204 1185 fs_info->metadata_ratio = old_metadata_ratio; 1205 1186 return ret; 1206 1187 }
+41 -18
fs/btrfs/transaction.c
··· 28 28 #include "locking.h" 29 29 #include "tree-log.h" 30 30 #include "inode-map.h" 31 + #include "volumes.h" 31 32 32 33 #define BTRFS_ROOT_TRANS_TAG 0 33 34 ··· 56 55 static noinline int join_transaction(struct btrfs_root *root, int nofail) 57 56 { 58 57 struct btrfs_transaction *cur_trans; 58 + struct btrfs_fs_info *fs_info = root->fs_info; 59 59 60 - spin_lock(&root->fs_info->trans_lock); 60 + spin_lock(&fs_info->trans_lock); 61 61 loop: 62 62 /* The file system has been taken offline. No new transactions. */ 63 - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 64 - spin_unlock(&root->fs_info->trans_lock); 63 + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 64 + spin_unlock(&fs_info->trans_lock); 65 65 return -EROFS; 66 66 } 67 67 68 - if (root->fs_info->trans_no_join) { 68 + if (fs_info->trans_no_join) { 69 69 if (!nofail) { 70 - spin_unlock(&root->fs_info->trans_lock); 70 + spin_unlock(&fs_info->trans_lock); 71 71 return -EBUSY; 72 72 } 73 73 } 74 74 75 - cur_trans = root->fs_info->running_transaction; 75 + cur_trans = fs_info->running_transaction; 76 76 if (cur_trans) { 77 77 if (cur_trans->aborted) { 78 - spin_unlock(&root->fs_info->trans_lock); 78 + spin_unlock(&fs_info->trans_lock); 79 79 return cur_trans->aborted; 80 80 } 81 81 atomic_inc(&cur_trans->use_count); 82 82 atomic_inc(&cur_trans->num_writers); 83 83 cur_trans->num_joined++; 84 - spin_unlock(&root->fs_info->trans_lock); 84 + spin_unlock(&fs_info->trans_lock); 85 85 return 0; 86 86 } 87 - spin_unlock(&root->fs_info->trans_lock); 87 + spin_unlock(&fs_info->trans_lock); 88 88 89 89 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 90 90 if (!cur_trans) 91 91 return -ENOMEM; 92 92 93 - spin_lock(&root->fs_info->trans_lock); 94 - if (root->fs_info->running_transaction) { 93 + spin_lock(&fs_info->trans_lock); 94 + if (fs_info->running_transaction) { 95 95 /* 96 96 * someone started a transaction after we unlocked. 
Make sure 97 97 * to redo the trans_no_join checks above 98 98 */ 99 99 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 100 - cur_trans = root->fs_info->running_transaction; 100 + cur_trans = fs_info->running_transaction; 101 101 goto loop; 102 102 } 103 103 ··· 123 121 cur_trans->delayed_refs.flushing = 0; 124 122 cur_trans->delayed_refs.run_delayed_start = 0; 125 123 cur_trans->delayed_refs.seq = 1; 124 + 125 + /* 126 + * although the tree mod log is per file system and not per transaction, 127 + * the log must never go across transaction boundaries. 128 + */ 129 + smp_mb(); 130 + if (!list_empty(&fs_info->tree_mod_seq_list)) { 131 + printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " 132 + "creating a fresh transaction\n"); 133 + WARN_ON(1); 134 + } 135 + if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) { 136 + printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when " 137 + "creating a fresh transaction\n"); 138 + WARN_ON(1); 139 + } 140 + atomic_set(&fs_info->tree_mod_seq, 0); 141 + 126 142 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); 127 143 spin_lock_init(&cur_trans->commit_lock); 128 144 spin_lock_init(&cur_trans->delayed_refs.lock); 129 145 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); 130 146 131 147 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 132 - list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 148 + list_add_tail(&cur_trans->list, &fs_info->trans_list); 133 149 extent_io_tree_init(&cur_trans->dirty_pages, 134 - root->fs_info->btree_inode->i_mapping); 135 - root->fs_info->generation++; 136 - cur_trans->transid = root->fs_info->generation; 137 - root->fs_info->running_transaction = cur_trans; 150 + fs_info->btree_inode->i_mapping); 151 + fs_info->generation++; 152 + cur_trans->transid = fs_info->generation; 153 + fs_info->running_transaction = cur_trans; 138 154 cur_trans->aborted = 0; 139 - spin_unlock(&root->fs_info->trans_lock); 155 + spin_unlock(&fs_info->trans_lock); 140 156 141 157 return 0; 142 158 } ··· 
777 757 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 778 758 if (ret) 779 759 return ret; 760 + 761 + ret = btrfs_run_dev_stats(trans, root->fs_info); 762 + BUG_ON(ret); 780 763 781 764 while (!list_empty(&fs_info->dirty_cowonly_roots)) { 782 765 next = fs_info->dirty_cowonly_roots.next;
+16 -19
fs/btrfs/tree-log.c
··· 1628 1628 int i; 1629 1629 int ret; 1630 1630 1631 - btrfs_read_buffer(eb, gen); 1631 + ret = btrfs_read_buffer(eb, gen); 1632 + if (ret) 1633 + return ret; 1632 1634 1633 1635 level = btrfs_header_level(eb); 1634 1636 ··· 1751 1749 1752 1750 path->slots[*level]++; 1753 1751 if (wc->free) { 1754 - btrfs_read_buffer(next, ptr_gen); 1752 + ret = btrfs_read_buffer(next, ptr_gen); 1753 + if (ret) { 1754 + free_extent_buffer(next); 1755 + return ret; 1756 + } 1755 1757 1756 1758 btrfs_tree_lock(next); 1757 1759 btrfs_set_lock_blocking(next); ··· 1772 1766 free_extent_buffer(next); 1773 1767 continue; 1774 1768 } 1775 - btrfs_read_buffer(next, ptr_gen); 1769 + ret = btrfs_read_buffer(next, ptr_gen); 1770 + if (ret) { 1771 + free_extent_buffer(next); 1772 + return ret; 1773 + } 1776 1774 1777 1775 WARN_ON(*level <= 0); 1778 1776 if (path->nodes[*level-1]) ··· 2667 2657 btrfs_release_path(path); 2668 2658 } 2669 2659 btrfs_release_path(path); 2660 + if (ret > 0) 2661 + ret = 0; 2670 2662 return ret; 2671 2663 } 2672 2664 ··· 3040 3028 return ret; 3041 3029 } 3042 3030 3043 - static int inode_in_log(struct btrfs_trans_handle *trans, 3044 - struct inode *inode) 3045 - { 3046 - struct btrfs_root *root = BTRFS_I(inode)->root; 3047 - int ret = 0; 3048 - 3049 - mutex_lock(&root->log_mutex); 3050 - if (BTRFS_I(inode)->logged_trans == trans->transid && 3051 - BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 3052 - ret = 1; 3053 - mutex_unlock(&root->log_mutex); 3054 - return ret; 3055 - } 3056 - 3057 - 3058 3031 /* 3059 3032 * helper function around btrfs_log_inode to make sure newly created 3060 3033 * parent directories also end up in the log. A minimal inode and backref ··· 3080 3083 if (ret) 3081 3084 goto end_no_trans; 3082 3085 3083 - if (inode_in_log(trans, inode)) { 3086 + if (btrfs_inode_in_log(inode, trans->transid)) { 3084 3087 ret = BTRFS_NO_LOG_SYNC; 3085 3088 goto end_no_trans; 3086 3089 }
+20 -18
fs/btrfs/ulist.c
··· 23 23 * 24 24 * ulist = ulist_alloc(); 25 25 * ulist_add(ulist, root); 26 - * elem = NULL; 26 + * ULIST_ITER_INIT(&uiter); 27 27 * 28 - * while ((elem = ulist_next(ulist, elem)) { 28 + * while ((elem = ulist_next(ulist, &uiter)) { 29 29 * for (all child nodes n in elem) 30 30 * ulist_add(ulist, n); 31 31 * do something useful with the node; ··· 95 95 * 96 96 * The allocated ulist will be returned in an initialized state. 97 97 */ 98 - struct ulist *ulist_alloc(unsigned long gfp_mask) 98 + struct ulist *ulist_alloc(gfp_t gfp_mask) 99 99 { 100 100 struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); 101 101 ··· 144 144 * unaltered. 145 145 */ 146 146 int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 147 - unsigned long gfp_mask) 147 + gfp_t gfp_mask) 148 + { 149 + return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); 150 + } 151 + 152 + int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 153 + unsigned long *old_aux, gfp_t gfp_mask) 148 154 { 149 155 int i; 150 156 151 157 for (i = 0; i < ulist->nnodes; ++i) { 152 - if (ulist->nodes[i].val == val) 158 + if (ulist->nodes[i].val == val) { 159 + if (old_aux) 160 + *old_aux = ulist->nodes[i].aux; 153 161 return 0; 162 + } 154 163 } 155 164 156 165 if (ulist->nnodes >= ulist->nodes_alloced) { ··· 197 188 /** 198 189 * ulist_next - iterate ulist 199 190 * @ulist: ulist to iterate 200 - * @prev: previously returned element or %NULL to start iteration 191 + * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) 201 192 * 202 193 * Note: locking must be provided by the caller. In case of rwlocks only read 203 194 * locking is needed 204 195 * 205 - * This function is used to iterate an ulist. The iteration is started with 206 - * @prev = %NULL. It returns the next element from the ulist or %NULL when the 196 + * This function is used to iterate an ulist. 197 + * It returns the next element from the ulist or %NULL when the 207 198 * end is reached. 
No guarantee is made with respect to the order in which 208 199 * the elements are returned. They might neither be returned in order of 209 200 * addition nor in ascending order. 210 201 * It is allowed to call ulist_add during an enumeration. Newly added items 211 202 * are guaranteed to show up in the running enumeration. 212 203 */ 213 - struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) 204 + struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) 214 205 { 215 - int next; 216 - 217 206 if (ulist->nnodes == 0) 218 207 return NULL; 219 - 220 - if (!prev) 221 - return &ulist->nodes[0]; 222 - 223 - next = (prev - ulist->nodes) + 1; 224 - if (next < 0 || next >= ulist->nnodes) 208 + if (uiter->i < 0 || uiter->i >= ulist->nnodes) 225 209 return NULL; 226 210 227 - return &ulist->nodes[next]; 211 + return &ulist->nodes[uiter->i++]; 228 212 } 229 213 EXPORT_SYMBOL(ulist_next);
+12 -3
fs/btrfs/ulist.h
··· 24 24 */ 25 25 #define ULIST_SIZE 16 26 26 27 + struct ulist_iterator { 28 + int i; 29 + }; 30 + 27 31 /* 28 32 * element of the list 29 33 */ ··· 63 59 void ulist_init(struct ulist *ulist); 64 60 void ulist_fini(struct ulist *ulist); 65 61 void ulist_reinit(struct ulist *ulist); 66 - struct ulist *ulist_alloc(unsigned long gfp_mask); 62 + struct ulist *ulist_alloc(gfp_t gfp_mask); 67 63 void ulist_free(struct ulist *ulist); 68 64 int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, 69 - unsigned long gfp_mask); 70 - struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); 65 + gfp_t gfp_mask); 66 + int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux, 67 + unsigned long *old_aux, gfp_t gfp_mask); 68 + struct ulist_node *ulist_next(struct ulist *ulist, 69 + struct ulist_iterator *uiter); 70 + 71 + #define ULIST_ITER_INIT(uiter) ((uiter)->i = 0) 71 72 72 73 #endif
+303 -3
fs/btrfs/volumes.c
··· 23 23 #include <linux/random.h> 24 24 #include <linux/iocontext.h> 25 25 #include <linux/capability.h> 26 + #include <linux/ratelimit.h> 26 27 #include <linux/kthread.h> 27 28 #include <asm/div64.h> 28 29 #include "compat.h" ··· 40 39 struct btrfs_root *root, 41 40 struct btrfs_device *device); 42 41 static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 42 + static void __btrfs_reset_dev_stats(struct btrfs_device *dev); 43 + static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 43 44 44 45 static DEFINE_MUTEX(uuid_mutex); 45 46 static LIST_HEAD(fs_uuids); ··· 364 361 return -ENOMEM; 365 362 } 366 363 device->devid = devid; 364 + device->dev_stats_valid = 0; 367 365 device->work.func = pending_bios_fn; 368 366 memcpy(device->uuid, disk_super->dev_item.uuid, 369 367 BTRFS_UUID_SIZE); ··· 1637 1633 int ret = 0; 1638 1634 1639 1635 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1640 - return -EINVAL; 1636 + return -EROFS; 1641 1637 1642 1638 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 1643 1639 root->fs_info->bdev_holder); ··· 4005 4001 return 0; 4006 4002 } 4007 4003 4004 + static void *merge_stripe_index_into_bio_private(void *bi_private, 4005 + unsigned int stripe_index) 4006 + { 4007 + /* 4008 + * with single, dup, RAID0, RAID1 and RAID10, stripe_index is 4009 + * at most 1. 4010 + * The alternative solution (instead of stealing bits from the 4011 + * pointer) would be to allocate an intermediate structure 4012 + * that contains the old private pointer plus the stripe_index. 
4013 + */ 4014 + BUG_ON((((uintptr_t)bi_private) & 3) != 0); 4015 + BUG_ON(stripe_index > 3); 4016 + return (void *)(((uintptr_t)bi_private) | stripe_index); 4017 + } 4018 + 4019 + static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private) 4020 + { 4021 + return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3)); 4022 + } 4023 + 4024 + static unsigned int extract_stripe_index_from_bio_private(void *bi_private) 4025 + { 4026 + return (unsigned int)((uintptr_t)bi_private) & 3; 4027 + } 4028 + 4008 4029 static void btrfs_end_bio(struct bio *bio, int err) 4009 4030 { 4010 - struct btrfs_bio *bbio = bio->bi_private; 4031 + struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private); 4011 4032 int is_orig_bio = 0; 4012 4033 4013 - if (err) 4034 + if (err) { 4014 4035 atomic_inc(&bbio->error); 4036 + if (err == -EIO || err == -EREMOTEIO) { 4037 + unsigned int stripe_index = 4038 + extract_stripe_index_from_bio_private( 4039 + bio->bi_private); 4040 + struct btrfs_device *dev; 4041 + 4042 + BUG_ON(stripe_index >= bbio->num_stripes); 4043 + dev = bbio->stripes[stripe_index].dev; 4044 + if (bio->bi_rw & WRITE) 4045 + btrfs_dev_stat_inc(dev, 4046 + BTRFS_DEV_STAT_WRITE_ERRS); 4047 + else 4048 + btrfs_dev_stat_inc(dev, 4049 + BTRFS_DEV_STAT_READ_ERRS); 4050 + if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) 4051 + btrfs_dev_stat_inc(dev, 4052 + BTRFS_DEV_STAT_FLUSH_ERRS); 4053 + btrfs_dev_stat_print_on_error(dev); 4054 + } 4055 + } 4015 4056 4016 4057 if (bio == bbio->orig_bio) 4017 4058 is_orig_bio = 1; ··· 4198 4149 bio = first_bio; 4199 4150 } 4200 4151 bio->bi_private = bbio; 4152 + bio->bi_private = merge_stripe_index_into_bio_private( 4153 + bio->bi_private, (unsigned int)dev_nr); 4201 4154 bio->bi_end_io = btrfs_end_bio; 4202 4155 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; 4203 4156 dev = bbio->stripes[dev_nr].dev; ··· 4560 4509 return ret; 4561 4510 } 4562 4511 4512 + struct btrfs_device 
*btrfs_find_device_for_logical(struct btrfs_root *root, 4513 + u64 logical, int mirror_num) 4514 + { 4515 + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4516 + int ret; 4517 + u64 map_length = 0; 4518 + struct btrfs_bio *bbio = NULL; 4519 + struct btrfs_device *device; 4520 + 4521 + BUG_ON(mirror_num == 0); 4522 + ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio, 4523 + mirror_num); 4524 + if (ret) { 4525 + BUG_ON(bbio != NULL); 4526 + return NULL; 4527 + } 4528 + BUG_ON(mirror_num != bbio->mirror_num); 4529 + device = bbio->stripes[mirror_num - 1].dev; 4530 + kfree(bbio); 4531 + return device; 4532 + } 4533 + 4563 4534 int btrfs_read_chunk_tree(struct btrfs_root *root) 4564 4535 { 4565 4536 struct btrfs_path *path; ··· 4655 4582 4656 4583 btrfs_free_path(path); 4657 4584 return ret; 4585 + } 4586 + 4587 + static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 4588 + { 4589 + int i; 4590 + 4591 + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 4592 + btrfs_dev_stat_reset(dev, i); 4593 + } 4594 + 4595 + int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 4596 + { 4597 + struct btrfs_key key; 4598 + struct btrfs_key found_key; 4599 + struct btrfs_root *dev_root = fs_info->dev_root; 4600 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 4601 + struct extent_buffer *eb; 4602 + int slot; 4603 + int ret = 0; 4604 + struct btrfs_device *device; 4605 + struct btrfs_path *path = NULL; 4606 + int i; 4607 + 4608 + path = btrfs_alloc_path(); 4609 + if (!path) { 4610 + ret = -ENOMEM; 4611 + goto out; 4612 + } 4613 + 4614 + mutex_lock(&fs_devices->device_list_mutex); 4615 + list_for_each_entry(device, &fs_devices->devices, dev_list) { 4616 + int item_size; 4617 + struct btrfs_dev_stats_item *ptr; 4618 + 4619 + key.objectid = 0; 4620 + key.type = BTRFS_DEV_STATS_KEY; 4621 + key.offset = device->devid; 4622 + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 4623 + if (ret) { 4624 + printk(KERN_WARNING "btrfs: 
no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", 4625 + device->name, (unsigned long long)device->devid); 4626 + __btrfs_reset_dev_stats(device); 4627 + device->dev_stats_valid = 1; 4628 + btrfs_release_path(path); 4629 + continue; 4630 + } 4631 + slot = path->slots[0]; 4632 + eb = path->nodes[0]; 4633 + btrfs_item_key_to_cpu(eb, &found_key, slot); 4634 + item_size = btrfs_item_size_nr(eb, slot); 4635 + 4636 + ptr = btrfs_item_ptr(eb, slot, 4637 + struct btrfs_dev_stats_item); 4638 + 4639 + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 4640 + if (item_size >= (1 + i) * sizeof(__le64)) 4641 + btrfs_dev_stat_set(device, i, 4642 + btrfs_dev_stats_value(eb, ptr, i)); 4643 + else 4644 + btrfs_dev_stat_reset(device, i); 4645 + } 4646 + 4647 + device->dev_stats_valid = 1; 4648 + btrfs_dev_stat_print_on_load(device); 4649 + btrfs_release_path(path); 4650 + } 4651 + mutex_unlock(&fs_devices->device_list_mutex); 4652 + 4653 + out: 4654 + btrfs_free_path(path); 4655 + return ret < 0 ? 
ret : 0; 4656 + } 4657 + 4658 + static int update_dev_stat_item(struct btrfs_trans_handle *trans, 4659 + struct btrfs_root *dev_root, 4660 + struct btrfs_device *device) 4661 + { 4662 + struct btrfs_path *path; 4663 + struct btrfs_key key; 4664 + struct extent_buffer *eb; 4665 + struct btrfs_dev_stats_item *ptr; 4666 + int ret; 4667 + int i; 4668 + 4669 + key.objectid = 0; 4670 + key.type = BTRFS_DEV_STATS_KEY; 4671 + key.offset = device->devid; 4672 + 4673 + path = btrfs_alloc_path(); 4674 + BUG_ON(!path); 4675 + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 4676 + if (ret < 0) { 4677 + printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", 4678 + ret, device->name); 4679 + goto out; 4680 + } 4681 + 4682 + if (ret == 0 && 4683 + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 4684 + /* need to delete old one and insert a new one */ 4685 + ret = btrfs_del_item(trans, dev_root, path); 4686 + if (ret != 0) { 4687 + printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", 4688 + device->name, ret); 4689 + goto out; 4690 + } 4691 + ret = 1; 4692 + } 4693 + 4694 + if (ret == 1) { 4695 + /* need to insert a new item */ 4696 + btrfs_release_path(path); 4697 + ret = btrfs_insert_empty_item(trans, dev_root, path, 4698 + &key, sizeof(*ptr)); 4699 + if (ret < 0) { 4700 + printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", 4701 + device->name, ret); 4702 + goto out; 4703 + } 4704 + } 4705 + 4706 + eb = path->nodes[0]; 4707 + ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 4708 + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 4709 + btrfs_set_dev_stats_value(eb, ptr, i, 4710 + btrfs_dev_stat_read(device, i)); 4711 + btrfs_mark_buffer_dirty(eb); 4712 + 4713 + out: 4714 + btrfs_free_path(path); 4715 + return ret; 4716 + } 4717 + 4718 + /* 4719 + * called from commit_transaction. Writes all changed device stats to disk. 
4720 + */ 4721 + int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 4722 + struct btrfs_fs_info *fs_info) 4723 + { 4724 + struct btrfs_root *dev_root = fs_info->dev_root; 4725 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 4726 + struct btrfs_device *device; 4727 + int ret = 0; 4728 + 4729 + mutex_lock(&fs_devices->device_list_mutex); 4730 + list_for_each_entry(device, &fs_devices->devices, dev_list) { 4731 + if (!device->dev_stats_valid || !device->dev_stats_dirty) 4732 + continue; 4733 + 4734 + ret = update_dev_stat_item(trans, dev_root, device); 4735 + if (!ret) 4736 + device->dev_stats_dirty = 0; 4737 + } 4738 + mutex_unlock(&fs_devices->device_list_mutex); 4739 + 4740 + return ret; 4741 + } 4742 + 4743 + void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 4744 + { 4745 + btrfs_dev_stat_inc(dev, index); 4746 + btrfs_dev_stat_print_on_error(dev); 4747 + } 4748 + 4749 + void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) 4750 + { 4751 + if (!dev->dev_stats_valid) 4752 + return; 4753 + printk_ratelimited(KERN_ERR 4754 + "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 4755 + dev->name, 4756 + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 4757 + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 4758 + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 4759 + btrfs_dev_stat_read(dev, 4760 + BTRFS_DEV_STAT_CORRUPTION_ERRS), 4761 + btrfs_dev_stat_read(dev, 4762 + BTRFS_DEV_STAT_GENERATION_ERRS)); 4763 + } 4764 + 4765 + static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 4766 + { 4767 + printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 4768 + dev->name, 4769 + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 4770 + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 4771 + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 4772 + btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 4773 + btrfs_dev_stat_read(dev, 
BTRFS_DEV_STAT_GENERATION_ERRS)); 4774 + } 4775 + 4776 + int btrfs_get_dev_stats(struct btrfs_root *root, 4777 + struct btrfs_ioctl_get_dev_stats *stats, 4778 + int reset_after_read) 4779 + { 4780 + struct btrfs_device *dev; 4781 + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 4782 + int i; 4783 + 4784 + mutex_lock(&fs_devices->device_list_mutex); 4785 + dev = btrfs_find_device(root, stats->devid, NULL, NULL); 4786 + mutex_unlock(&fs_devices->device_list_mutex); 4787 + 4788 + if (!dev) { 4789 + printk(KERN_WARNING 4790 + "btrfs: get dev_stats failed, device not found\n"); 4791 + return -ENODEV; 4792 + } else if (!dev->dev_stats_valid) { 4793 + printk(KERN_WARNING 4794 + "btrfs: get dev_stats failed, not yet valid\n"); 4795 + return -ENODEV; 4796 + } else if (reset_after_read) { 4797 + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 4798 + if (stats->nr_items > i) 4799 + stats->values[i] = 4800 + btrfs_dev_stat_read_and_reset(dev, i); 4801 + else 4802 + btrfs_dev_stat_reset(dev, i); 4803 + } 4804 + } else { 4805 + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 4806 + if (stats->nr_items > i) 4807 + stats->values[i] = btrfs_dev_stat_read(dev, i); 4808 + } 4809 + if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 4810 + stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 4811 + return 0; 4658 4812 }
+52
fs/btrfs/volumes.h
··· 22 22 #include <linux/bio.h> 23 23 #include <linux/sort.h> 24 24 #include "async-thread.h" 25 + #include "ioctl.h" 25 26 26 27 #define BTRFS_STRIPE_LEN (64 * 1024) 27 28 ··· 107 106 struct completion flush_wait; 108 107 int nobarriers; 109 108 109 + /* disk I/O failure stats. For detailed description refer to 110 + * enum btrfs_dev_stat_values in ioctl.h */ 111 + int dev_stats_valid; 112 + int dev_stats_dirty; /* counters need to be written to disk */ 113 + atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; 110 114 }; 111 115 112 116 struct btrfs_fs_devices { ··· 287 281 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 288 282 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 289 283 u64 *start, u64 *max_avail); 284 + struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, 285 + u64 logical, int mirror_num); 286 + void btrfs_dev_stat_print_on_error(struct btrfs_device *device); 287 + void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); 288 + int btrfs_get_dev_stats(struct btrfs_root *root, 289 + struct btrfs_ioctl_get_dev_stats *stats, 290 + int reset_after_read); 291 + int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 292 + int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 293 + struct btrfs_fs_info *fs_info); 294 + 295 + static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 296 + int index) 297 + { 298 + atomic_inc(dev->dev_stat_values + index); 299 + dev->dev_stats_dirty = 1; 300 + } 301 + 302 + static inline int btrfs_dev_stat_read(struct btrfs_device *dev, 303 + int index) 304 + { 305 + return atomic_read(dev->dev_stat_values + index); 306 + } 307 + 308 + static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, 309 + int index) 310 + { 311 + int ret; 312 + 313 + ret = atomic_xchg(dev->dev_stat_values + index, 0); 314 + dev->dev_stats_dirty = 1; 315 + return ret; 316 + } 317 + 318 + static inline void btrfs_dev_stat_set(struct 
btrfs_device *dev, 319 + int index, unsigned long val) 320 + { 321 + atomic_set(dev->dev_stat_values + index, val); 322 + dev->dev_stats_dirty = 1; 323 + } 324 + 325 + static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, 326 + int index) 327 + { 328 + btrfs_dev_stat_set(dev, index, 0); 329 + } 290 330 #endif
+1
fs/btrfs/xattr.c
··· 196 196 if (ret) 197 197 goto out; 198 198 199 + inode_inc_iversion(inode); 199 200 inode->i_ctime = CURRENT_TIME; 200 201 ret = btrfs_update_inode(trans, root, inode); 201 202 BUG_ON(ret);