Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'ext4_for_linue' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 fixes from Ted Ts'o:
"Fix a number of regressions and other bugs in ext4, most of which were
relatively obscure corner cases or races that were found using
regression tests."

* tag 'ext4_for_linue' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (21 commits)
ext4: fix data=journal fast mount/umount hang
ext4: fix ext4_evict_inode() racing against workqueue processing code
ext4: fix memory leakage in mext_check_coverage
ext4: use s_extent_max_zeroout_kb value as number of kb
ext4: use atomic64_t for the per-flexbg free_clusters count
jbd2: fix use after free in jbd2_journal_dirty_metadata()
ext4: reserve metadata block for every delayed write
ext4: update reserved space after the 'correction'
ext4: do not use yield()
ext4: remove unused variable in ext4_free_blocks()
ext4: fix WARN_ON from ext4_releasepage()
ext4: fix the wrong number of the allocated blocks in ext4_split_extent()
ext4: update extent status tree after an extent is zeroed out
ext4: fix wrong m_len value after unwritten extent conversion
ext4: add self-testing infrastructure to do a sanity check
ext4: avoid a potential overflow in ext4_es_can_be_merged()
ext4: invalidate extent status tree during extent migration
ext4: remove unnecessary wait for extent conversion in ext4_fallocate()
ext4: add warning to ext4_convert_unwritten_extents_endio
ext4: disable merging of uninitialized extents
...

+541 -82
+4 -4
fs/ext4/ext4.h
··· 335 335 */ 336 336 337 337 struct flex_groups { 338 - atomic_t free_inodes; 339 - atomic_t free_clusters; 340 - atomic_t used_dirs; 338 + atomic64_t free_clusters; 339 + atomic_t free_inodes; 340 + atomic_t used_dirs; 341 341 }; 342 342 343 343 #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ ··· 2617 2617 extern int __init ext4_init_pageio(void); 2618 2618 extern void ext4_add_complete_io(ext4_io_end_t *io_end); 2619 2619 extern void ext4_exit_pageio(void); 2620 - extern void ext4_ioend_wait(struct inode *); 2620 + extern void ext4_ioend_shutdown(struct inode *); 2621 2621 extern void ext4_free_io_end(ext4_io_end_t *io); 2622 2622 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); 2623 2623 extern void ext4_end_io_work(struct work_struct *work);
+84 -21
fs/ext4/extents.c
··· 1584 1584 unsigned short ext1_ee_len, ext2_ee_len, max_len; 1585 1585 1586 1586 /* 1587 - * Make sure that either both extents are uninitialized, or 1588 - * both are _not_. 1587 + * Make sure that both extents are initialized. We don't merge 1588 + * uninitialized extents so that we can be sure that end_io code has 1589 + * the extent that was written properly split out and conversion to 1590 + * initialized is trivial. 1589 1591 */ 1590 - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) 1592 + if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) 1591 1593 return 0; 1592 1594 1593 1595 if (ext4_ext_is_uninitialized(ex1)) ··· 2925 2923 { 2926 2924 ext4_fsblk_t newblock; 2927 2925 ext4_lblk_t ee_block; 2928 - struct ext4_extent *ex, newex, orig_ex; 2926 + struct ext4_extent *ex, newex, orig_ex, zero_ex; 2929 2927 struct ext4_extent *ex2 = NULL; 2930 2928 unsigned int ee_len, depth; 2931 2929 int err = 0; ··· 2945 2943 newblock = split - ee_block + ext4_ext_pblock(ex); 2946 2944 2947 2945 BUG_ON(split < ee_block || split >= (ee_block + ee_len)); 2946 + BUG_ON(!ext4_ext_is_uninitialized(ex) && 2947 + split_flag & (EXT4_EXT_MAY_ZEROOUT | 2948 + EXT4_EXT_MARK_UNINIT1 | 2949 + EXT4_EXT_MARK_UNINIT2)); 2948 2950 2949 2951 err = ext4_ext_get_access(handle, inode, path + depth); 2950 2952 if (err) ··· 2996 2990 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 2997 2991 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 2998 2992 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { 2999 - if (split_flag & EXT4_EXT_DATA_VALID1) 2993 + if (split_flag & EXT4_EXT_DATA_VALID1) { 3000 2994 err = ext4_ext_zeroout(inode, ex2); 3001 - else 2995 + zero_ex.ee_block = ex2->ee_block; 2996 + zero_ex.ee_len = ext4_ext_get_actual_len(ex2); 2997 + ext4_ext_store_pblock(&zero_ex, 2998 + ext4_ext_pblock(ex2)); 2999 + } else { 3002 3000 err = ext4_ext_zeroout(inode, ex); 3003 - } else 3001 + zero_ex.ee_block = 
ex->ee_block; 3002 + zero_ex.ee_len = ext4_ext_get_actual_len(ex); 3003 + ext4_ext_store_pblock(&zero_ex, 3004 + ext4_ext_pblock(ex)); 3005 + } 3006 + } else { 3004 3007 err = ext4_ext_zeroout(inode, &orig_ex); 3008 + zero_ex.ee_block = orig_ex.ee_block; 3009 + zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex); 3010 + ext4_ext_store_pblock(&zero_ex, 3011 + ext4_ext_pblock(&orig_ex)); 3012 + } 3005 3013 3006 3014 if (err) 3007 3015 goto fix_extent_len; ··· 3023 3003 ex->ee_len = cpu_to_le16(ee_len); 3024 3004 ext4_ext_try_to_merge(handle, inode, path, ex); 3025 3005 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3006 + if (err) 3007 + goto fix_extent_len; 3008 + 3009 + /* update extent status tree */ 3010 + err = ext4_es_zeroout(inode, &zero_ex); 3011 + 3026 3012 goto out; 3027 3013 } else if (err) 3028 3014 goto fix_extent_len; ··· 3067 3041 int err = 0; 3068 3042 int uninitialized; 3069 3043 int split_flag1, flags1; 3044 + int allocated = map->m_len; 3070 3045 3071 3046 depth = ext_depth(inode); 3072 3047 ex = path[depth].p_ext; ··· 3087 3060 map->m_lblk + map->m_len, split_flag1, flags1); 3088 3061 if (err) 3089 3062 goto out; 3063 + } else { 3064 + allocated = ee_len - (map->m_lblk - ee_block); 3090 3065 } 3091 - 3066 + /* 3067 + * Update path is required because previous ext4_split_extent_at() may 3068 + * result in split of original leaf or extent zeroout. 
3069 + */ 3092 3070 ext4_ext_drop_refs(path); 3093 3071 path = ext4_ext_find_extent(inode, map->m_lblk, path); 3094 3072 if (IS_ERR(path)) 3095 3073 return PTR_ERR(path); 3074 + depth = ext_depth(inode); 3075 + ex = path[depth].p_ext; 3076 + uninitialized = ext4_ext_is_uninitialized(ex); 3077 + split_flag1 = 0; 3096 3078 3097 3079 if (map->m_lblk >= ee_block) { 3098 - split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | 3099 - EXT4_EXT_DATA_VALID2); 3100 - if (uninitialized) 3080 + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; 3081 + if (uninitialized) { 3101 3082 split_flag1 |= EXT4_EXT_MARK_UNINIT1; 3102 - if (split_flag & EXT4_EXT_MARK_UNINIT2) 3103 - split_flag1 |= EXT4_EXT_MARK_UNINIT2; 3083 + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | 3084 + EXT4_EXT_MARK_UNINIT2); 3085 + } 3104 3086 err = ext4_split_extent_at(handle, inode, path, 3105 3087 map->m_lblk, split_flag1, flags); 3106 3088 if (err) ··· 3118 3082 3119 3083 ext4_ext_show_leaf(inode, path); 3120 3084 out: 3121 - return err ? err : map->m_len; 3085 + return err ? 
err : allocated; 3122 3086 } 3123 3087 3124 3088 /* ··· 3173 3137 ee_block = le32_to_cpu(ex->ee_block); 3174 3138 ee_len = ext4_ext_get_actual_len(ex); 3175 3139 allocated = ee_len - (map->m_lblk - ee_block); 3140 + zero_ex.ee_len = 0; 3176 3141 3177 3142 trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); 3178 3143 ··· 3264 3227 3265 3228 if (EXT4_EXT_MAY_ZEROOUT & split_flag) 3266 3229 max_zeroout = sbi->s_extent_max_zeroout_kb >> 3267 - inode->i_sb->s_blocksize_bits; 3230 + (inode->i_sb->s_blocksize_bits - 10); 3268 3231 3269 3232 /* If extent is less than s_max_zeroout_kb, zeroout directly */ 3270 3233 if (max_zeroout && (ee_len <= max_zeroout)) { 3271 3234 err = ext4_ext_zeroout(inode, ex); 3272 3235 if (err) 3273 3236 goto out; 3237 + zero_ex.ee_block = ex->ee_block; 3238 + zero_ex.ee_len = ext4_ext_get_actual_len(ex); 3239 + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); 3274 3240 3275 3241 err = ext4_ext_get_access(handle, inode, path + depth); 3276 3242 if (err) ··· 3332 3292 err = allocated; 3333 3293 3334 3294 out: 3295 + /* If we have gotten a failure, don't zero out status tree */ 3296 + if (!err) 3297 + err = ext4_es_zeroout(inode, &zero_ex); 3335 3298 return err ? err : allocated; 3336 3299 } 3337 3300 ··· 3417 3374 "block %llu, max_blocks %u\n", inode->i_ino, 3418 3375 (unsigned long long)ee_block, ee_len); 3419 3376 3420 - /* If extent is larger than requested then split is required */ 3377 + /* If extent is larger than requested it is a clear sign that we still 3378 + * have some extent state machine issues left. So extent_split is still 3379 + * required. 3380 + * TODO: Once all related issues will be fixed this situation should be 3381 + * illegal. 
3382 + */ 3421 3383 if (ee_block != map->m_lblk || ee_len > map->m_len) { 3384 + #ifdef EXT4_DEBUG 3385 + ext4_warning("Inode (%ld) finished: extent logical block %llu," 3386 + " len %u; IO logical block %llu, len %u\n", 3387 + inode->i_ino, (unsigned long long)ee_block, ee_len, 3388 + (unsigned long long)map->m_lblk, map->m_len); 3389 + #endif 3422 3390 err = ext4_split_unwritten_extents(handle, inode, map, path, 3423 3391 EXT4_GET_BLOCKS_CONVERT); 3424 3392 if (err < 0) ··· 3680 3626 path, map->m_len); 3681 3627 } else 3682 3628 err = ret; 3629 + map->m_flags |= EXT4_MAP_MAPPED; 3630 + if (allocated > map->m_len) 3631 + allocated = map->m_len; 3632 + map->m_len = allocated; 3683 3633 goto out2; 3684 3634 } 3685 3635 /* buffered IO case */ ··· 3733 3675 allocated - map->m_len); 3734 3676 allocated = map->m_len; 3735 3677 } 3678 + map->m_len = allocated; 3736 3679 3737 3680 /* 3738 3681 * If we have done fallocate with the offset that is already ··· 4165 4106 } 4166 4107 } else { 4167 4108 BUG_ON(allocated_clusters < reserved_clusters); 4168 - /* We will claim quota for all newly allocated blocks.*/ 4169 - ext4_da_update_reserve_space(inode, allocated_clusters, 4170 - 1); 4171 4109 if (reserved_clusters < allocated_clusters) { 4172 4110 struct ext4_inode_info *ei = EXT4_I(inode); 4173 4111 int reservation = allocated_clusters - ··· 4215 4159 ei->i_reserved_data_blocks += reservation; 4216 4160 spin_unlock(&ei->i_block_reservation_lock); 4217 4161 } 4162 + /* 4163 + * We will claim quota for all newly allocated blocks. 4164 + * We're updating the reserved space *after* the 4165 + * correction above so we do not accidentally free 4166 + * all the metadata reservation because we might 4167 + * actually need it later on. 
4168 + */ 4169 + ext4_da_update_reserve_space(inode, allocated_clusters, 4170 + 1); 4218 4171 } 4219 4172 } 4220 4173 ··· 4433 4368 if (len <= EXT_UNINIT_MAX_LEN << blkbits) 4434 4369 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; 4435 4370 4436 - /* Prevent race condition between unwritten */ 4437 - ext4_flush_unwritten_io(inode); 4438 4371 retry: 4439 4372 while (ret >= 0 && ret < max_blocks) { 4440 4373 map.m_lblk = map.m_lblk + ret;
+208 -6
fs/ext4/extents_status.c
··· 333 333 static int ext4_es_can_be_merged(struct extent_status *es1, 334 334 struct extent_status *es2) 335 335 { 336 - if (es1->es_lblk + es1->es_len != es2->es_lblk) 337 - return 0; 338 - 339 336 if (ext4_es_status(es1) != ext4_es_status(es2)) 340 337 return 0; 341 338 342 - if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && 343 - (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2))) 339 + if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL) 344 340 return 0; 345 341 346 - return 1; 342 + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) 343 + return 0; 344 + 345 + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && 346 + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) 347 + return 1; 348 + 349 + if (ext4_es_is_hole(es1)) 350 + return 1; 351 + 352 + /* we need to check delayed extent is without unwritten status */ 353 + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) 354 + return 1; 355 + 356 + return 0; 347 357 } 348 358 349 359 static struct extent_status * ··· 398 388 399 389 return es; 400 390 } 391 + 392 + #ifdef ES_AGGRESSIVE_TEST 393 + static void ext4_es_insert_extent_ext_check(struct inode *inode, 394 + struct extent_status *es) 395 + { 396 + struct ext4_ext_path *path = NULL; 397 + struct ext4_extent *ex; 398 + ext4_lblk_t ee_block; 399 + ext4_fsblk_t ee_start; 400 + unsigned short ee_len; 401 + int depth, ee_status, es_status; 402 + 403 + path = ext4_ext_find_extent(inode, es->es_lblk, NULL); 404 + if (IS_ERR(path)) 405 + return; 406 + 407 + depth = ext_depth(inode); 408 + ex = path[depth].p_ext; 409 + 410 + if (ex) { 411 + 412 + ee_block = le32_to_cpu(ex->ee_block); 413 + ee_start = ext4_ext_pblock(ex); 414 + ee_len = ext4_ext_get_actual_len(ex); 415 + 416 + ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; 417 + es_status = ext4_es_is_unwritten(es) ? 1 : 0; 418 + 419 + /* 420 + * Make sure ex and es are not overlap when we try to insert 421 + * a delayed/hole extent. 
422 + */ 423 + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { 424 + if (in_range(es->es_lblk, ee_block, ee_len)) { 425 + pr_warn("ES insert assertation failed for " 426 + "inode: %lu we can find an extent " 427 + "at block [%d/%d/%llu/%c], but we " 428 + "want to add an delayed/hole extent " 429 + "[%d/%d/%llu/%llx]\n", 430 + inode->i_ino, ee_block, ee_len, 431 + ee_start, ee_status ? 'u' : 'w', 432 + es->es_lblk, es->es_len, 433 + ext4_es_pblock(es), ext4_es_status(es)); 434 + } 435 + goto out; 436 + } 437 + 438 + /* 439 + * We don't check ee_block == es->es_lblk, etc. because es 440 + * might be a part of whole extent, vice versa. 441 + */ 442 + if (es->es_lblk < ee_block || 443 + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { 444 + pr_warn("ES insert assertation failed for inode: %lu " 445 + "ex_status [%d/%d/%llu/%c] != " 446 + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 447 + ee_block, ee_len, ee_start, 448 + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, 449 + ext4_es_pblock(es), es_status ? 'u' : 'w'); 450 + goto out; 451 + } 452 + 453 + if (ee_status ^ es_status) { 454 + pr_warn("ES insert assertation failed for inode: %lu " 455 + "ex_status [%d/%d/%llu/%c] != " 456 + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 457 + ee_block, ee_len, ee_start, 458 + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, 459 + ext4_es_pblock(es), es_status ? 'u' : 'w'); 460 + } 461 + } else { 462 + /* 463 + * We can't find an extent on disk. So we need to make sure 464 + * that we don't want to add an written/unwritten extent. 
465 + */ 466 + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { 467 + pr_warn("ES insert assertation failed for inode: %lu " 468 + "can't find an extent at block %d but we want " 469 + "to add an written/unwritten extent " 470 + "[%d/%d/%llu/%llx]\n", inode->i_ino, 471 + es->es_lblk, es->es_lblk, es->es_len, 472 + ext4_es_pblock(es), ext4_es_status(es)); 473 + } 474 + } 475 + out: 476 + if (path) { 477 + ext4_ext_drop_refs(path); 478 + kfree(path); 479 + } 480 + } 481 + 482 + static void ext4_es_insert_extent_ind_check(struct inode *inode, 483 + struct extent_status *es) 484 + { 485 + struct ext4_map_blocks map; 486 + int retval; 487 + 488 + /* 489 + * Here we call ext4_ind_map_blocks to lookup a block mapping because 490 + * 'Indirect' structure is defined in indirect.c. So we couldn't 491 + * access direct/indirect tree from outside. It is too dirty to define 492 + * this function in indirect.c file. 493 + */ 494 + 495 + map.m_lblk = es->es_lblk; 496 + map.m_len = es->es_len; 497 + 498 + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); 499 + if (retval > 0) { 500 + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { 501 + /* 502 + * We want to add a delayed/hole extent but this 503 + * block has been allocated. 
504 + */ 505 + pr_warn("ES insert assertation failed for inode: %lu " 506 + "We can find blocks but we want to add a " 507 + "delayed/hole extent [%d/%d/%llu/%llx]\n", 508 + inode->i_ino, es->es_lblk, es->es_len, 509 + ext4_es_pblock(es), ext4_es_status(es)); 510 + return; 511 + } else if (ext4_es_is_written(es)) { 512 + if (retval != es->es_len) { 513 + pr_warn("ES insert assertation failed for " 514 + "inode: %lu retval %d != es_len %d\n", 515 + inode->i_ino, retval, es->es_len); 516 + return; 517 + } 518 + if (map.m_pblk != ext4_es_pblock(es)) { 519 + pr_warn("ES insert assertation failed for " 520 + "inode: %lu m_pblk %llu != " 521 + "es_pblk %llu\n", 522 + inode->i_ino, map.m_pblk, 523 + ext4_es_pblock(es)); 524 + return; 525 + } 526 + } else { 527 + /* 528 + * We don't need to check unwritten extent because 529 + * indirect-based file doesn't have it. 530 + */ 531 + BUG_ON(1); 532 + } 533 + } else if (retval == 0) { 534 + if (ext4_es_is_written(es)) { 535 + pr_warn("ES insert assertation failed for inode: %lu " 536 + "We can't find the block but we want to add " 537 + "an written extent [%d/%d/%llu/%llx]\n", 538 + inode->i_ino, es->es_lblk, es->es_len, 539 + ext4_es_pblock(es), ext4_es_status(es)); 540 + return; 541 + } 542 + } 543 + } 544 + 545 + static inline void ext4_es_insert_extent_check(struct inode *inode, 546 + struct extent_status *es) 547 + { 548 + /* 549 + * We don't need to worry about the race condition because 550 + * caller takes i_data_sem locking. 
551 + */ 552 + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); 553 + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 554 + ext4_es_insert_extent_ext_check(inode, es); 555 + else 556 + ext4_es_insert_extent_ind_check(inode, es); 557 + } 558 + #else 559 + static inline void ext4_es_insert_extent_check(struct inode *inode, 560 + struct extent_status *es) 561 + { 562 + } 563 + #endif 401 564 402 565 static int __es_insert_extent(struct inode *inode, struct extent_status *newes) 403 566 { ··· 653 470 ext4_es_store_pblock(&newes, pblk); 654 471 ext4_es_store_status(&newes, status); 655 472 trace_ext4_es_insert_extent(inode, &newes); 473 + 474 + ext4_es_insert_extent_check(inode, &newes); 656 475 657 476 write_lock(&EXT4_I(inode)->i_es_lock); 658 477 err = __es_remove_extent(inode, lblk, end); ··· 852 667 write_unlock(&EXT4_I(inode)->i_es_lock); 853 668 ext4_es_print_tree(inode); 854 669 return err; 670 + } 671 + 672 + int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) 673 + { 674 + ext4_lblk_t ee_block; 675 + ext4_fsblk_t ee_pblock; 676 + unsigned int ee_len; 677 + 678 + ee_block = le32_to_cpu(ex->ee_block); 679 + ee_len = ext4_ext_get_actual_len(ex); 680 + ee_pblock = ext4_ext_pblock(ex); 681 + 682 + if (ee_len == 0) 683 + return 0; 684 + 685 + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, 686 + EXTENT_STATUS_WRITTEN); 855 687 } 856 688 857 689 static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+9
fs/ext4/extents_status.h
··· 21 21 #endif 22 22 23 23 /* 24 + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be 25 + * checked with old map_block's result. 26 + */ 27 + #define ES_AGGRESSIVE_TEST__ 28 + 29 + /* 24 30 * These flags live in the high bits of extent_status.es_pblk 25 31 */ 26 32 #define EXTENT_STATUS_WRITTEN (1ULL << 63) ··· 38 32 EXTENT_STATUS_UNWRITTEN | \ 39 33 EXTENT_STATUS_DELAYED | \ 40 34 EXTENT_STATUS_HOLE) 35 + 36 + struct ext4_extent; 41 37 42 38 struct extent_status { 43 39 struct rb_node rb_node; ··· 66 58 struct extent_status *es); 67 59 extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 68 60 struct extent_status *es); 61 + extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); 69 62 70 63 static inline int ext4_es_is_written(struct extent_status *es) 71 64 {
+2 -2
fs/ext4/ialloc.c
··· 324 324 } 325 325 326 326 struct orlov_stats { 327 + __u64 free_clusters; 327 328 __u32 free_inodes; 328 - __u32 free_clusters; 329 329 __u32 used_dirs; 330 330 }; 331 331 ··· 342 342 343 343 if (flex_size > 1) { 344 344 stats->free_inodes = atomic_read(&flex_group[g].free_inodes); 345 - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); 345 + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); 346 346 stats->used_dirs = atomic_read(&flex_group[g].used_dirs); 347 347 return; 348 348 }
+174 -8
fs/ext4/inode.c
··· 185 185 186 186 trace_ext4_evict_inode(inode); 187 187 188 - ext4_ioend_wait(inode); 189 - 190 188 if (inode->i_nlink) { 191 189 /* 192 190 * When journalling data dirty buffers are tracked only in the ··· 205 207 * don't use page cache. 206 208 */ 207 209 if (ext4_should_journal_data(inode) && 208 - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { 210 + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && 211 + inode->i_ino != EXT4_JOURNAL_INO) { 209 212 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 210 213 tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; 211 214 ··· 215 216 filemap_write_and_wait(&inode->i_data); 216 217 } 217 218 truncate_inode_pages(&inode->i_data, 0); 219 + ext4_ioend_shutdown(inode); 218 220 goto no_delete; 219 221 } 220 222 ··· 225 225 if (ext4_should_order_data(inode)) 226 226 ext4_begin_ordered_truncate(inode, 0); 227 227 truncate_inode_pages(&inode->i_data, 0); 228 + ext4_ioend_shutdown(inode); 228 229 229 230 if (is_bad_inode(inode)) 230 231 goto no_delete; ··· 483 482 return num; 484 483 } 485 484 485 + #ifdef ES_AGGRESSIVE_TEST 486 + static void ext4_map_blocks_es_recheck(handle_t *handle, 487 + struct inode *inode, 488 + struct ext4_map_blocks *es_map, 489 + struct ext4_map_blocks *map, 490 + int flags) 491 + { 492 + int retval; 493 + 494 + map->m_flags = 0; 495 + /* 496 + * There is a race window that the result is not the same. 497 + * e.g. xfstests #223 when dioread_nolock enables. The reason 498 + * is that we lookup a block mapping in extent status tree with 499 + * out taking i_data_sem. So at the time the unwritten extent 500 + * could be converted. 
501 + */ 502 + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) 503 + down_read((&EXT4_I(inode)->i_data_sem)); 504 + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 505 + retval = ext4_ext_map_blocks(handle, inode, map, flags & 506 + EXT4_GET_BLOCKS_KEEP_SIZE); 507 + } else { 508 + retval = ext4_ind_map_blocks(handle, inode, map, flags & 509 + EXT4_GET_BLOCKS_KEEP_SIZE); 510 + } 511 + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) 512 + up_read((&EXT4_I(inode)->i_data_sem)); 513 + /* 514 + * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag 515 + * because it shouldn't be marked in es_map->m_flags. 516 + */ 517 + map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); 518 + 519 + /* 520 + * We don't check m_len because extent will be collpased in status 521 + * tree. So the m_len might not equal. 522 + */ 523 + if (es_map->m_lblk != map->m_lblk || 524 + es_map->m_flags != map->m_flags || 525 + es_map->m_pblk != map->m_pblk) { 526 + printk("ES cache assertation failed for inode: %lu " 527 + "es_cached ex [%d/%d/%llu/%x] != " 528 + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", 529 + inode->i_ino, es_map->m_lblk, es_map->m_len, 530 + es_map->m_pblk, es_map->m_flags, map->m_lblk, 531 + map->m_len, map->m_pblk, map->m_flags, 532 + retval, flags); 533 + } 534 + } 535 + #endif /* ES_AGGRESSIVE_TEST */ 536 + 486 537 /* 487 538 * The ext4_map_blocks() function tries to look up the requested blocks, 488 539 * and returns if the blocks are already mapped. 
··· 562 509 { 563 510 struct extent_status es; 564 511 int retval; 512 + #ifdef ES_AGGRESSIVE_TEST 513 + struct ext4_map_blocks orig_map; 514 + 515 + memcpy(&orig_map, map, sizeof(*map)); 516 + #endif 565 517 566 518 map->m_flags = 0; 567 519 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," ··· 589 531 } else { 590 532 BUG_ON(1); 591 533 } 534 + #ifdef ES_AGGRESSIVE_TEST 535 + ext4_map_blocks_es_recheck(handle, inode, map, 536 + &orig_map, flags); 537 + #endif 592 538 goto found; 593 539 } 594 540 ··· 612 550 if (retval > 0) { 613 551 int ret; 614 552 unsigned long long status; 553 + 554 + #ifdef ES_AGGRESSIVE_TEST 555 + if (retval != map->m_len) { 556 + printk("ES len assertation failed for inode: %lu " 557 + "retval %d != map->m_len %d " 558 + "in %s (lookup)\n", inode->i_ino, retval, 559 + map->m_len, __func__); 560 + } 561 + #endif 615 562 616 563 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 617 564 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ··· 714 643 int ret; 715 644 unsigned long long status; 716 645 646 + #ifdef ES_AGGRESSIVE_TEST 647 + if (retval != map->m_len) { 648 + printk("ES len assertation failed for inode: %lu " 649 + "retval %d != map->m_len %d " 650 + "in %s (allocation)\n", inode->i_ino, retval, 651 + map->m_len, __func__); 652 + } 653 + #endif 654 + 655 + /* 656 + * If the extent has been zeroed out, we don't need to update 657 + * extent status tree. 658 + */ 659 + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && 660 + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 661 + if (ext4_es_is_written(&es)) 662 + goto has_zeroout; 663 + } 717 664 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
718 665 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 719 666 if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && ··· 744 655 retval = ret; 745 656 } 746 657 658 + has_zeroout: 747 659 up_write((&EXT4_I(inode)->i_data_sem)); 748 660 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 749 661 int ret = check_block_validity(inode, map); ··· 1306 1216 } 1307 1217 1308 1218 /* 1219 + * Reserve a metadata for a single block located at lblock 1220 + */ 1221 + static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) 1222 + { 1223 + int retries = 0; 1224 + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1225 + struct ext4_inode_info *ei = EXT4_I(inode); 1226 + unsigned int md_needed; 1227 + ext4_lblk_t save_last_lblock; 1228 + int save_len; 1229 + 1230 + /* 1231 + * recalculate the amount of metadata blocks to reserve 1232 + * in order to allocate nrblocks 1233 + * worse case is one extent per block 1234 + */ 1235 + repeat: 1236 + spin_lock(&ei->i_block_reservation_lock); 1237 + /* 1238 + * ext4_calc_metadata_amount() has side effects, which we have 1239 + * to be prepared undo if we fail to claim space. 1240 + */ 1241 + save_len = ei->i_da_metadata_calc_len; 1242 + save_last_lblock = ei->i_da_metadata_calc_last_lblock; 1243 + md_needed = EXT4_NUM_B2C(sbi, 1244 + ext4_calc_metadata_amount(inode, lblock)); 1245 + trace_ext4_da_reserve_space(inode, md_needed); 1246 + 1247 + /* 1248 + * We do still charge estimated metadata to the sb though; 1249 + * we cannot afford to run out of free blocks. 
1250 + */ 1251 + if (ext4_claim_free_clusters(sbi, md_needed, 0)) { 1252 + ei->i_da_metadata_calc_len = save_len; 1253 + ei->i_da_metadata_calc_last_lblock = save_last_lblock; 1254 + spin_unlock(&ei->i_block_reservation_lock); 1255 + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1256 + cond_resched(); 1257 + goto repeat; 1258 + } 1259 + return -ENOSPC; 1260 + } 1261 + ei->i_reserved_meta_blocks += md_needed; 1262 + spin_unlock(&ei->i_block_reservation_lock); 1263 + 1264 + return 0; /* success */ 1265 + } 1266 + 1267 + /* 1309 1268 * Reserve a single cluster located at lblock 1310 1269 */ 1311 1270 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) ··· 1402 1263 ei->i_da_metadata_calc_last_lblock = save_last_lblock; 1403 1264 spin_unlock(&ei->i_block_reservation_lock); 1404 1265 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1405 - yield(); 1266 + cond_resched(); 1406 1267 goto repeat; 1407 1268 } 1408 1269 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); ··· 1907 1768 struct extent_status es; 1908 1769 int retval; 1909 1770 sector_t invalid_block = ~((sector_t) 0xffff); 1771 + #ifdef ES_AGGRESSIVE_TEST 1772 + struct ext4_map_blocks orig_map; 1773 + 1774 + memcpy(&orig_map, map, sizeof(*map)); 1775 + #endif 1910 1776 1911 1777 if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) 1912 1778 invalid_block = ~0; ··· 1953 1809 else 1954 1810 BUG_ON(1); 1955 1811 1812 + #ifdef ES_AGGRESSIVE_TEST 1813 + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); 1814 + #endif 1956 1815 return retval; 1957 1816 } 1958 1817 ··· 1990 1843 * XXX: __block_prepare_write() unmaps passed block, 1991 1844 * is it OK? 1992 1845 */ 1993 - /* If the block was allocated from previously allocated cluster, 1994 - * then we dont need to reserve it again. */ 1846 + /* 1847 + * If the block was allocated from previously allocated cluster, 1848 + * then we don't need to reserve it again. 
However we still need 1849 + * to reserve metadata for every block we're going to write. 1850 + */ 1995 1851 if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { 1996 1852 ret = ext4_da_reserve_space(inode, iblock); 1853 + if (ret) { 1854 + /* not enough space to reserve */ 1855 + retval = ret; 1856 + goto out_unlock; 1857 + } 1858 + } else { 1859 + ret = ext4_da_reserve_metadata(inode, iblock); 1997 1860 if (ret) { 1998 1861 /* not enough space to reserve */ 1999 1862 retval = ret; ··· 2029 1872 } else if (retval > 0) { 2030 1873 int ret; 2031 1874 unsigned long long status; 1875 + 1876 + #ifdef ES_AGGRESSIVE_TEST 1877 + if (retval != map->m_len) { 1878 + printk("ES len assertation failed for inode: %lu " 1879 + "retval %d != map->m_len %d " 1880 + "in %s (lookup)\n", inode->i_ino, retval, 1881 + map->m_len, __func__); 1882 + } 1883 + #endif 2032 1884 2033 1885 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 2034 1886 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ··· 3074 2908 3075 2909 trace_ext4_releasepage(page); 3076 2910 3077 - WARN_ON(PageChecked(page)); 3078 - if (!page_has_buffers(page)) 2911 + /* Page has dirty journalled data -> cannot release */ 2912 + if (PageChecked(page)) 3079 2913 return 0; 3080 2914 if (journal) 3081 2915 return jbd2_journal_try_to_free_buffers(journal, page, wait);
+8 -15
fs/ext4/mballoc.c
··· 2804 2804 if (sbi->s_log_groups_per_flex) { 2805 2805 ext4_group_t flex_group = ext4_flex_group(sbi, 2806 2806 ac->ac_b_ex.fe_group); 2807 - atomic_sub(ac->ac_b_ex.fe_len, 2808 - &sbi->s_flex_groups[flex_group].free_clusters); 2807 + atomic64_sub(ac->ac_b_ex.fe_len, 2808 + &sbi->s_flex_groups[flex_group].free_clusters); 2809 2809 } 2810 2810 2811 2811 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); ··· 3692 3692 if (free < needed && busy) { 3693 3693 busy = 0; 3694 3694 ext4_unlock_group(sb, group); 3695 - /* 3696 - * Yield the CPU here so that we don't get soft lockup 3697 - * in non preempt case. 3698 - */ 3699 - yield(); 3695 + cond_resched(); 3700 3696 goto repeat; 3701 3697 } 3702 3698 ··· 4242 4246 ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { 4243 4247 4244 4248 /* let others to free the space */ 4245 - yield(); 4249 + cond_resched(); 4246 4250 ar->len = ar->len >> 1; 4247 4251 } 4248 4252 if (!ar->len) { ··· 4460 4464 struct buffer_head *bitmap_bh = NULL; 4461 4465 struct super_block *sb = inode->i_sb; 4462 4466 struct ext4_group_desc *gdp; 4463 - unsigned long freed = 0; 4464 4467 unsigned int overflow; 4465 4468 ext4_grpblk_t bit; 4466 4469 struct buffer_head *gd_bh; ··· 4661 4666 4662 4667 if (sbi->s_log_groups_per_flex) { 4663 4668 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4664 - atomic_add(count_clusters, 4665 - &sbi->s_flex_groups[flex_group].free_clusters); 4669 + atomic64_add(count_clusters, 4670 + &sbi->s_flex_groups[flex_group].free_clusters); 4666 4671 } 4667 4672 4668 4673 ext4_mb_unload_buddy(&e4b); 4669 - 4670 - freed += count; 4671 4674 4672 4675 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) 4673 4676 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); ··· 4804 4811 4805 4812 if (sbi->s_log_groups_per_flex) { 4806 4813 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4807 - atomic_add(EXT4_NUM_B2C(sbi, blocks_freed), 4808 - &sbi->s_flex_groups[flex_group].free_clusters); 4814 + 
atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), 4815 + &sbi->s_flex_groups[flex_group].free_clusters); 4809 4816 } 4810 4817 4811 4818 ext4_mb_unload_buddy(&e4b);
+27 -16
fs/ext4/move_extent.c
··· 32 32 */ 33 33 static inline int 34 34 get_ext_path(struct inode *inode, ext4_lblk_t lblock, 35 - struct ext4_ext_path **path) 35 + struct ext4_ext_path **orig_path) 36 36 { 37 37 int ret = 0; 38 + struct ext4_ext_path *path; 38 39 39 - *path = ext4_ext_find_extent(inode, lblock, *path); 40 - if (IS_ERR(*path)) { 41 - ret = PTR_ERR(*path); 42 - *path = NULL; 43 - } else if ((*path)[ext_depth(inode)].p_ext == NULL) 40 + path = ext4_ext_find_extent(inode, lblock, *orig_path); 41 + if (IS_ERR(path)) 42 + ret = PTR_ERR(path); 43 + else if (path[ext_depth(inode)].p_ext == NULL) 44 44 ret = -ENODATA; 45 + else 46 + *orig_path = path; 45 47 46 48 return ret; 47 49 } ··· 613 611 { 614 612 struct ext4_ext_path *path = NULL; 615 613 struct ext4_extent *ext; 614 + int ret = 0; 616 615 ext4_lblk_t last = from + count; 617 616 while (from < last) { 618 617 *err = get_ext_path(inode, from, &path); 619 618 if (*err) 620 - return 0; 619 + goto out; 621 620 ext = path[ext_depth(inode)].p_ext; 622 - if (!ext) { 623 - ext4_ext_drop_refs(path); 624 - return 0; 625 - } 626 - if (uninit != ext4_ext_is_uninitialized(ext)) { 627 - ext4_ext_drop_refs(path); 628 - return 0; 629 - } 621 + if (uninit != ext4_ext_is_uninitialized(ext)) 622 + goto out; 630 623 from += ext4_ext_get_actual_len(ext); 631 624 ext4_ext_drop_refs(path); 632 625 } 633 - return 1; 626 + ret = 1; 627 + out: 628 + if (path) { 629 + ext4_ext_drop_refs(path); 630 + kfree(path); 631 + } 632 + return ret; 634 633 } 635 634 636 635 /** ··· 668 665 int depth; 669 666 int replaced_count = 0; 670 667 int dext_alen; 668 + 669 + *err = ext4_es_remove_extent(orig_inode, from, count); 670 + if (*err) 671 + goto out; 672 + 673 + *err = ext4_es_remove_extent(donor_inode, from, count); 674 + if (*err) 675 + goto out; 671 676 672 677 /* Get the original extent for the block "orig_off" */ 673 678 *err = get_ext_path(orig_inode, orig_off, &orig_path);
+11 -1
fs/ext4/page-io.c
··· 50 50 kmem_cache_destroy(io_page_cachep); 51 51 } 52 52 53 - void ext4_ioend_wait(struct inode *inode) 53 + /* 54 + * This function is called by ext4_evict_inode() to make sure there is 55 + * no more pending I/O completion work left to do. 56 + */ 57 + void ext4_ioend_shutdown(struct inode *inode) 54 58 { 55 59 wait_queue_head_t *wq = ext4_ioend_wq(inode); 56 60 57 61 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 62 + /* 63 + * We need to make sure the work structure is finished being 64 + * used before we let the inode get destroyed. 65 + */ 66 + if (work_pending(&EXT4_I(inode)->i_unwritten_work)) 67 + cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); 58 68 } 59 69 60 70 static void put_io_page(struct ext4_io_page *io_page)
+2 -2
fs/ext4/resize.c
··· 1360 1360 sbi->s_log_groups_per_flex) { 1361 1361 ext4_group_t flex_group; 1362 1362 flex_group = ext4_flex_group(sbi, group_data[0].group); 1363 - atomic_add(EXT4_NUM_B2C(sbi, free_blocks), 1364 - &sbi->s_flex_groups[flex_group].free_clusters); 1363 + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), 1364 + &sbi->s_flex_groups[flex_group].free_clusters); 1365 1365 atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, 1366 1366 &sbi->s_flex_groups[flex_group].free_inodes); 1367 1367 }
+2 -2
fs/ext4/super.c
··· 1927 1927 flex_group = ext4_flex_group(sbi, i); 1928 1928 atomic_add(ext4_free_inodes_count(sb, gdp), 1929 1929 &sbi->s_flex_groups[flex_group].free_inodes); 1930 - atomic_add(ext4_free_group_clusters(sb, gdp), 1931 - &sbi->s_flex_groups[flex_group].free_clusters); 1930 + atomic64_add(ext4_free_group_clusters(sb, gdp), 1931 + &sbi->s_flex_groups[flex_group].free_clusters); 1932 1932 atomic_add(ext4_used_dirs_count(sb, gdp), 1933 1933 &sbi->s_flex_groups[flex_group].used_dirs); 1934 1934 }
+10 -5
fs/jbd2/transaction.c
··· 1065 1065 void jbd2_journal_set_triggers(struct buffer_head *bh, 1066 1066 struct jbd2_buffer_trigger_type *type) 1067 1067 { 1068 - struct journal_head *jh = bh2jh(bh); 1068 + struct journal_head *jh = jbd2_journal_grab_journal_head(bh); 1069 1069 1070 + if (WARN_ON(!jh)) 1071 + return; 1070 1072 jh->b_triggers = type; 1073 + jbd2_journal_put_journal_head(jh); 1071 1074 } 1072 1075 1073 1076 void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, ··· 1122 1119 { 1123 1120 transaction_t *transaction = handle->h_transaction; 1124 1121 journal_t *journal = transaction->t_journal; 1125 - struct journal_head *jh = bh2jh(bh); 1122 + struct journal_head *jh; 1126 1123 int ret = 0; 1127 1124 1128 - jbd_debug(5, "journal_head %p\n", jh); 1129 - JBUFFER_TRACE(jh, "entry"); 1130 1125 if (is_handle_aborted(handle)) 1131 1126 goto out; 1132 - if (!buffer_jbd(bh)) { 1127 + jh = jbd2_journal_grab_journal_head(bh); 1128 + if (!jh) { 1133 1129 ret = -EUCLEAN; 1134 1130 goto out; 1135 1131 } 1132 + jbd_debug(5, "journal_head %p\n", jh); 1133 + JBUFFER_TRACE(jh, "entry"); 1136 1134 1137 1135 jbd_lock_bh_state(bh); 1138 1136 ··· 1224 1220 spin_unlock(&journal->j_list_lock); 1225 1221 out_unlock_bh: 1226 1222 jbd_unlock_bh_state(bh); 1223 + jbd2_journal_put_journal_head(jh); 1227 1224 out: 1228 1225 JBUFFER_TRACE(jh, "exit"); 1229 1226 WARN_ON(ret); /* All errors are bugs, so dump the stack */