Merge tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 fixes from Ted Ts'o:
"Various bug fixes for ext4 fast commit and inline data handling.

Also fix a regression introduced as part of moving to the new mount API."

* tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
fs/ext4: fix comments mentioning i_mutex
ext4: fix incorrect type issue during replay_del_range
jbd2: fix kernel-doc descriptions for jbd2_journal_shrink_{scan,count}()
ext4: fix potential NULL pointer dereference in ext4_fill_super()
jbd2: refactor wait logic for transaction updates into a common function
jbd2: cleanup unused functions declarations from jbd2.h
ext4: fix error handling in ext4_fc_record_modified_inode()
ext4: remove redundant max inline_size check in ext4_da_write_inline_data_begin()
ext4: fix error handling in ext4_restore_inline_data()
ext4: fast commit may miss file actions
ext4: fast commit may not fallback for ineligible commit
ext4: modify the logic of ext4_mb_new_blocks_simple
ext4: prevent used blocks from being allocated during fast commit replay

Linus Torvalds 4 years ago d8ad2ce8 18118a42

+196 -160

19 changed files

expand all collapse all

ext4

acl.c

ext4.h

ext4_jbd2.h

extents.c

fast_commit.c

indirect.c

inline.c

inode.c

ioctl.c

mballoc.c

migrate.c

namei.c

orphan.c

super.c

xattr.c

jbd2

commit.c

journal.c

transaction.c

include

linux

jbd2.h

+4 -4

fs/ext4/acl.c

reviewed

··· 139 139 /* 140 140 * Inode operation get_posix_acl(). 141 141 * 142 142 - * inode->i_mutex: don't care 142 142 + * inode->i_rwsem: don't care 143 143 */ 144 144 struct posix_acl * 145 145 ext4_get_acl(struct inode *inode, int type, bool rcu) ··· 183 183 /* 184 184 * Set the access or default ACL of an inode. 185 185 * 186 186 - * inode->i_mutex: down unless called from ext4_new_inode 186 186 + * inode->i_rwsem: down unless called from ext4_new_inode 187 187 */ 188 188 static int 189 189 __ext4_set_acl(handle_t *handle, struct inode *inode, int type, ··· 271 271 /* 272 272 * Initialize the ACLs of a new inode. Called from ext4_new_inode. 273 273 * 274 274 - * dir->i_mutex: down 275 275 - * inode->i_mutex: up (access to inode is still exclusive) 274 274 + * dir->i_rwsem: down 275 275 + * inode->i_rwsem: up (access to inode is still exclusive) 276 276 */ 277 277 int 278 278 ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)

+9 -8

fs/ext4/ext4.h

reviewed

··· 1028 1028 1029 1029 /* 1030 1030 * Extended attributes can be read independently of the main file 1031 1031 - * data. Taking i_mutex even when reading would cause contention 1031 1031 + * data. Taking i_rwsem even when reading would cause contention 1032 1032 * between readers of EAs and writers of regular file data, so 1033 1033 * instead we synchronize on xattr_sem when reading or changing 1034 1034 * EAs. ··· 1750 1750 spinlock_t s_fc_lock; 1751 1751 struct buffer_head *s_fc_bh; 1752 1752 struct ext4_fc_stats s_fc_stats; 1753 1753 + tid_t s_fc_ineligible_tid; 1753 1754 #ifdef CONFIG_EXT4_DEBUG 1754 1755 int s_fc_debug_max_replay; 1755 1756 #endif ··· 1796 1795 enum { 1797 1796 EXT4_MF_MNTDIR_SAMPLED, 1798 1797 EXT4_MF_FS_ABORTED, /* Fatal error detected */ 1799 1799 - EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ 1800 1800 - EXT4_MF_FC_COMMITTING /* File system underoing a fast 1801 1801 - * commit. 1802 1802 - */ 1798 1798 + EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ 1803 1799 }; 1804 1800 1805 1801 static inline void ext4_set_mount_flag(struct super_block *sb, int bit) ··· 2924 2926 struct dentry *dentry); 2925 2927 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); 2926 2928 void ext4_fc_track_inode(handle_t *handle, struct inode *inode); 2927 2927 - void ext4_fc_mark_ineligible(struct super_block *sb, int reason); 2929 2929 + void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); 2928 2930 void ext4_fc_start_update(struct inode *inode); 2929 2931 void ext4_fc_stop_update(struct inode *inode); 2930 2932 void ext4_fc_del(struct inode *inode); ··· 2933 2935 int ext4_fc_commit(journal_t *journal, tid_t commit_tid); 2934 2936 int __init ext4_fc_init_dentry_cache(void); 2935 2937 void ext4_fc_destroy_dentry_cache(void); 2938 2938 + int ext4_fc_record_regions(struct super_block *sb, int ino, 2939 2939 + ext4_lblk_t lblk, ext4_fsblk_t pblk, 2940 2940 + int len, int replay); 2936 2941 2937 2942 /* 
mballoc.c */ 2938 2943 extern const struct seq_operations ext4_mb_seq_groups_ops; ··· 3408 3407 #define EXT4_FREECLUSTERS_WATERMARK 0 3409 3408 #endif 3410 3409 3411 3411 - /* Update i_disksize. Requires i_mutex to avoid races with truncate */ 3410 3410 + /* Update i_disksize. Requires i_rwsem to avoid races with truncate */ 3412 3411 static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) 3413 3412 { 3414 3413 WARN_ON_ONCE(S_ISREG(inode->i_mode) && ··· 3419 3418 up_write(&EXT4_I(inode)->i_data_sem); 3420 3419 } 3421 3420 3422 3422 - /* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */ 3421 3421 + /* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */ 3423 3422 static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) 3424 3423 { 3425 3424 int changed = 0;

+1 -1

fs/ext4/ext4_jbd2.h

reviewed

··· 491 491 /* 492 492 * This function controls whether or not we should try to go down the 493 493 * dioread_nolock code paths, which makes it safe to avoid taking 494 494 - * i_mutex for direct I/O reads. This only works for extent-based 494 494 + * i_rwsem for direct I/O reads. This only works for extent-based 495 495 * files, and it doesn't work if data journaling is enabled, since the 496 496 * dioread_nolock code uses b_private to pass information back to the 497 497 * I/O completion handler, and this conflicts with the jbd's use of

+10 -6

fs/ext4/extents.c

reviewed

··· 97 97 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this 98 98 * moment, get_block can be called only for blocks inside i_size since 99 99 * page cache has been already dropped and writes are blocked by 100 100 - * i_mutex. So we can safely drop the i_data_sem here. 100 100 + * i_rwsem. So we can safely drop the i_data_sem here. 101 101 */ 102 102 BUG_ON(EXT4_JOURNAL(inode) == NULL); 103 103 ext4_discard_preallocations(inode, 0); ··· 4572 4572 4573 4573 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; 4574 4574 4575 4575 - /* Wait all existing dio workers, newcomers will block on i_mutex */ 4575 4575 + /* Wait all existing dio workers, newcomers will block on i_rwsem */ 4576 4576 inode_dio_wait(inode); 4577 4577 4578 4578 /* Preallocate the range including the unaligned edges */ ··· 4738 4738 goto out; 4739 4739 } 4740 4740 4741 4741 - /* Wait all existing dio workers, newcomers will block on i_mutex */ 4741 4741 + /* Wait all existing dio workers, newcomers will block on i_rwsem */ 4742 4742 inode_dio_wait(inode); 4743 4743 4744 4744 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); ··· 5334 5334 ret = PTR_ERR(handle); 5335 5335 goto out_mmap; 5336 5336 } 5337 5337 - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); 5337 5337 + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); 5338 5338 5339 5339 down_write(&EXT4_I(inode)->i_data_sem); 5340 5340 ext4_discard_preallocations(inode, 0); ··· 5474 5474 ret = PTR_ERR(handle); 5475 5475 goto out_mmap; 5476 5476 } 5477 5477 - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); 5477 5477 + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); 5478 5478 5479 5479 /* Expand file to avoid data loss if there is error while shifting */ 5480 5480 inode->i_size += len; ··· 5571 5571 * stuff such as page-cache locking consistency, bh mapping consistency or 5572 5572 * extent's data copying must be performed by caller. 
5573 5573 * Locking: 5574 5574 - * i_mutex is held for both inodes 5574 5574 + * i_rwsem is held for both inodes 5575 5575 * i_data_sem is locked for write for both inodes 5576 5576 * Assumptions: 5577 5577 * All pages from requested range are locked for both inodes ··· 6091 6091 6092 6092 ext4_mb_mark_bb(inode->i_sb, 6093 6093 path[j].p_block, 1, 0); 6094 6094 + ext4_fc_record_regions(inode->i_sb, inode->i_ino, 6095 6095 + 0, path[j].p_block, 1, 1); 6094 6096 } 6095 6097 ext4_ext_drop_refs(path); 6096 6098 kfree(path); 6097 6099 } 6098 6100 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 6101 6101 + ext4_fc_record_regions(inode->i_sb, inode->i_ino, 6102 6102 + map.m_lblk, map.m_pblk, map.m_len, 1); 6099 6103 } 6100 6104 cur = cur + map.m_len; 6101 6105 }

+78 -55

fs/ext4/fast_commit.c

reviewed

··· 300 300 } 301 301 302 302 /* 303 303 - * Mark file system as fast commit ineligible. This means that next commit 304 304 - * operation would result in a full jbd2 commit. 303 303 + * Mark file system as fast commit ineligible, and record latest 304 304 + * ineligible transaction tid. This means until the recorded 305 305 + * transaction, commit operation would result in a full jbd2 commit. 305 306 */ 306 306 - void ext4_fc_mark_ineligible(struct super_block *sb, int reason) 307 307 + void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) 307 308 { 308 309 struct ext4_sb_info *sbi = EXT4_SB(sb); 310 310 + tid_t tid; 309 311 310 312 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 311 313 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) 312 314 return; 313 315 314 316 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 317 317 + if (handle && !IS_ERR(handle)) 318 318 + tid = handle->h_transaction->t_tid; 319 319 + else { 320 320 + read_lock(&sbi->s_journal->j_state_lock); 321 321 + tid = sbi->s_journal->j_running_transaction ? 322 322 + sbi->s_journal->j_running_transaction->t_tid : 0; 323 323 + read_unlock(&sbi->s_journal->j_state_lock); 324 324 + } 325 325 + spin_lock(&sbi->s_fc_lock); 326 326 + if (sbi->s_fc_ineligible_tid < tid) 327 327 + sbi->s_fc_ineligible_tid = tid; 328 328 + spin_unlock(&sbi->s_fc_lock); 315 329 WARN_ON(reason >= EXT4_FC_REASON_MAX); 316 330 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; 317 331 } ··· 375 361 spin_lock(&sbi->s_fc_lock); 376 362 if (list_empty(&EXT4_I(inode)->i_fc_list)) 377 363 list_add_tail(&EXT4_I(inode)->i_fc_list, 378 378 - (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ? 364 364 + (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 365 365 + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 
379 366 &sbi->s_fc_q[FC_Q_STAGING] : 380 367 &sbi->s_fc_q[FC_Q_MAIN]); 381 368 spin_unlock(&sbi->s_fc_lock); ··· 402 387 mutex_unlock(&ei->i_fc_lock); 403 388 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); 404 389 if (!node) { 405 405 - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); 390 390 + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); 406 391 mutex_lock(&ei->i_fc_lock); 407 392 return -ENOMEM; 408 393 } ··· 415 400 if (!node->fcd_name.name) { 416 401 kmem_cache_free(ext4_fc_dentry_cachep, node); 417 402 ext4_fc_mark_ineligible(inode->i_sb, 418 418 - EXT4_FC_REASON_NOMEM); 403 403 + EXT4_FC_REASON_NOMEM, NULL); 419 404 mutex_lock(&ei->i_fc_lock); 420 405 return -ENOMEM; 421 406 } ··· 429 414 node->fcd_name.len = dentry->d_name.len; 430 415 431 416 spin_lock(&sbi->s_fc_lock); 432 432 - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) 417 417 + if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 418 418 + sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) 433 419 list_add_tail(&node->fcd_list, 434 420 &sbi->s_fc_dentry_q[FC_Q_STAGING]); 435 421 else ··· 518 502 519 503 if (ext4_should_journal_data(inode)) { 520 504 ext4_fc_mark_ineligible(inode->i_sb, 521 521 - EXT4_FC_REASON_INODE_JOURNAL_DATA); 505 505 + EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); 522 506 return; 523 507 } 524 508 ··· 895 879 int ret = 0; 896 880 897 881 spin_lock(&sbi->s_fc_lock); 898 898 - ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); 899 882 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 900 883 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); 901 884 while (atomic_read(&ei->i_fc_updates)) { ··· 1194 1179 * Fast commit cleanup routine. This is called after every fast commit and 1195 1180 * full commit. full is true if we are called after a full commit. 
1196 1181 */ 1197 1197 - static void ext4_fc_cleanup(journal_t *journal, int full) 1182 1182 + static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) 1198 1183 { 1199 1184 struct super_block *sb = journal->j_private; 1200 1185 struct ext4_sb_info *sbi = EXT4_SB(sb); ··· 1212 1197 list_del_init(&iter->i_fc_list); 1213 1198 ext4_clear_inode_state(&iter->vfs_inode, 1214 1199 EXT4_STATE_FC_COMMITTING); 1215 1215 - ext4_fc_reset_inode(&iter->vfs_inode); 1200 1200 + if (iter->i_sync_tid <= tid) 1201 1201 + ext4_fc_reset_inode(&iter->vfs_inode); 1216 1202 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ 1217 1203 smp_mb(); 1218 1204 #if (BITS_PER_LONG < 64) ··· 1242 1226 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], 1243 1227 &sbi->s_fc_q[FC_Q_MAIN]); 1244 1228 1245 1245 - ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); 1246 1246 - ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 1229 1229 + if (tid >= sbi->s_fc_ineligible_tid) { 1230 1230 + sbi->s_fc_ineligible_tid = 0; 1231 1231 + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 1232 1232 + } 1247 1233 1248 1234 if (full) 1249 1235 sbi->s_fc_bytes = 0; ··· 1410 1392 if (state->fc_modified_inodes[i] == ino) 1411 1393 return 0; 1412 1394 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { 1413 1413 - state->fc_modified_inodes_size += 1414 1414 - EXT4_FC_REPLAY_REALLOC_INCREMENT; 1415 1395 state->fc_modified_inodes = krealloc( 1416 1416 - state->fc_modified_inodes, sizeof(int) * 1417 1417 - state->fc_modified_inodes_size, 1418 1418 - GFP_KERNEL); 1396 1396 + state->fc_modified_inodes, 1397 1397 + sizeof(int) * (state->fc_modified_inodes_size + 1398 1398 + EXT4_FC_REPLAY_REALLOC_INCREMENT), 1399 1399 + GFP_KERNEL); 1419 1400 if (!state->fc_modified_inodes) 1420 1401 return -ENOMEM; 1402 1402 + state->fc_modified_inodes_size += 1403 1403 + EXT4_FC_REPLAY_REALLOC_INCREMENT; 1421 1404 } 1422 1405 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; 1423 1406 return 0; ··· 
1450 1431 } 1451 1432 inode = NULL; 1452 1433 1453 1453 - ext4_fc_record_modified_inode(sb, ino); 1434 1434 + ret = ext4_fc_record_modified_inode(sb, ino); 1435 1435 + if (ret) 1436 1436 + goto out; 1454 1437 1455 1438 raw_fc_inode = (struct ext4_inode *) 1456 1439 (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); ··· 1584 1563 } 1585 1564 1586 1565 /* 1587 1587 - * Record physical disk regions which are in use as per fast commit area. Our 1588 1588 - * simple replay phase allocator excludes these regions from allocation. 1566 1566 + * Record physical disk regions which are in use as per fast commit area, 1567 1567 + * and used by inodes during replay phase. Our simple replay phase 1568 1568 + * allocator excludes these regions from allocation. 1589 1569 */ 1590 1590 - static int ext4_fc_record_regions(struct super_block *sb, int ino, 1591 1591 - ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) 1570 1570 + int ext4_fc_record_regions(struct super_block *sb, int ino, 1571 1571 + ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) 1592 1572 { 1593 1573 struct ext4_fc_replay_state *state; 1594 1574 struct ext4_fc_alloc_region *region; 1595 1575 1596 1576 state = &EXT4_SB(sb)->s_fc_replay_state; 1577 1577 + /* 1578 1578 + * during replay phase, the fc_regions_valid may not same as 1579 1579 + * fc_regions_used, update it when do new additions. 
1580 1580 + */ 1581 1581 + if (replay && state->fc_regions_used != state->fc_regions_valid) 1582 1582 + state->fc_regions_used = state->fc_regions_valid; 1597 1583 if (state->fc_regions_used == state->fc_regions_size) { 1598 1584 state->fc_regions_size += 1599 1585 EXT4_FC_REPLAY_REALLOC_INCREMENT; ··· 1617 1589 region->lblk = lblk; 1618 1590 region->pblk = pblk; 1619 1591 region->len = len; 1592 1592 + 1593 1593 + if (replay) 1594 1594 + state->fc_regions_valid++; 1620 1595 1621 1596 return 0; 1622 1597 } ··· 1652 1621 } 1653 1622 1654 1623 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1624 1624 + if (ret) 1625 1625 + goto out; 1655 1626 1656 1627 start = le32_to_cpu(ex->ee_block); 1657 1628 start_pblk = ext4_ext_pblock(ex); ··· 1671 1638 map.m_pblk = 0; 1672 1639 ret = ext4_map_blocks(NULL, inode, &map, 0); 1673 1640 1674 1674 - if (ret < 0) { 1675 1675 - iput(inode); 1676 1676 - return 0; 1677 1677 - } 1641 1641 + if (ret < 0) 1642 1642 + goto out; 1678 1643 1679 1644 if (ret == 0) { 1680 1645 /* Range is not mapped */ 1681 1646 path = ext4_find_extent(inode, cur, NULL, 0); 1682 1682 - if (IS_ERR(path)) { 1683 1683 - iput(inode); 1684 1684 - return 0; 1685 1685 - } 1647 1647 + if (IS_ERR(path)) 1648 1648 + goto out; 1686 1649 memset(&newex, 0, sizeof(newex)); 1687 1650 newex.ee_block = cpu_to_le32(cur); 1688 1651 ext4_ext_store_pblock( ··· 1692 1663 up_write((&EXT4_I(inode)->i_data_sem)); 1693 1664 ext4_ext_drop_refs(path); 1694 1665 kfree(path); 1695 1695 - if (ret) { 1696 1696 - iput(inode); 1697 1697 - return 0; 1698 1698 - } 1666 1666 + if (ret) 1667 1667 + goto out; 1699 1668 goto next; 1700 1669 } 1701 1670 ··· 1706 1679 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1707 1680 ext4_ext_is_unwritten(ex), 1708 1681 start_pblk + cur - start); 1709 1709 - if (ret) { 1710 1710 - iput(inode); 1711 1711 - return 0; 1712 1712 - } 1682 1682 + if (ret) 1683 1683 + goto out; 1713 1684 /* 1714 1685 * Mark the old blocks as free since they aren't used 
1715 1686 * anymore. We maintain an array of all the modified ··· 1727 1702 ext4_ext_is_unwritten(ex), map.m_pblk); 1728 1703 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1729 1704 ext4_ext_is_unwritten(ex), map.m_pblk); 1730 1730 - if (ret) { 1731 1731 - iput(inode); 1732 1732 - return 0; 1733 1733 - } 1705 1705 + if (ret) 1706 1706 + goto out; 1734 1707 /* 1735 1708 * We may have split the extent tree while toggling the state. 1736 1709 * Try to shrink the extent tree now. ··· 1740 1717 } 1741 1718 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> 1742 1719 sb->s_blocksize_bits); 1720 1720 + out: 1743 1721 iput(inode); 1744 1722 return 0; 1745 1723 } ··· 1770 1746 } 1771 1747 1772 1748 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1749 1749 + if (ret) 1750 1750 + goto out; 1773 1751 1774 1752 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", 1775 1753 inode->i_ino, le32_to_cpu(lrange.fc_lblk), ··· 1781 1755 map.m_len = remaining; 1782 1756 1783 1757 ret = ext4_map_blocks(NULL, inode, &map, 0); 1784 1784 - if (ret < 0) { 1785 1785 - iput(inode); 1786 1786 - return 0; 1787 1787 - } 1758 1758 + if (ret < 0) 1759 1759 + goto out; 1788 1760 if (ret > 0) { 1789 1761 remaining -= ret; 1790 1762 cur += ret; ··· 1794 1770 } 1795 1771 1796 1772 down_write(&EXT4_I(inode)->i_data_sem); 1797 1797 - ret = ext4_ext_remove_space(inode, lrange.fc_lblk, 1798 1798 - lrange.fc_lblk + lrange.fc_len - 1); 1773 1773 + ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), 1774 1774 + le32_to_cpu(lrange.fc_lblk) + 1775 1775 + le32_to_cpu(lrange.fc_len) - 1); 1799 1776 up_write(&EXT4_I(inode)->i_data_sem); 1800 1800 - if (ret) { 1801 1801 - iput(inode); 1802 1802 - return 0; 1803 1803 - } 1777 1777 + if (ret) 1778 1778 + goto out; 1804 1779 ext4_ext_replay_shrink_inode(inode, 1805 1780 i_size_read(inode) >> sb->s_blocksize_bits); 1806 1781 ext4_mark_inode_dirty(NULL, inode); 1782 1782 + out: 1807 1783 iput(inode); 1808 1808 - 1809 1784 return 0; 
1810 1785 } 1811 1786 ··· 1960 1937 ret = ext4_fc_record_regions(sb, 1961 1938 le32_to_cpu(ext.fc_ino), 1962 1939 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), 1963 1963 - ext4_ext_get_actual_len(ex)); 1940 1940 + ext4_ext_get_actual_len(ex), 0); 1964 1941 if (ret < 0) 1965 1942 break; 1966 1943 ret = JBD2_FC_REPLAY_CONTINUE;

+1 -1

fs/ext4/indirect.c

reviewed

··· 696 696 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this 697 697 * moment, get_block can be called only for blocks inside i_size since 698 698 * page cache has been already dropped and writes are blocked by 699 699 - * i_mutex. So we can safely drop the i_data_sem here. 699 699 + * i_rwsem. So we can safely drop the i_data_sem here. 700 700 */ 701 701 BUG_ON(EXT4_JOURNAL(inode) == NULL); 702 702 ext4_discard_preallocations(inode, 0);

+13 -10

fs/ext4/inline.c

reviewed

··· 911 911 struct page **pagep, 912 912 void **fsdata) 913 913 { 914 914 - int ret, inline_size; 914 914 + int ret; 915 915 handle_t *handle; 916 916 struct page *page; 917 917 struct ext4_iloc iloc; ··· 928 928 goto out; 929 929 } 930 930 931 931 - inline_size = ext4_get_max_inline_size(inode); 932 932 - 933 933 - ret = -ENOSPC; 934 934 - if (inline_size >= pos + len) { 935 935 - ret = ext4_prepare_inline_data(handle, inode, pos + len); 936 936 - if (ret && ret != -ENOSPC) 937 937 - goto out_journal; 938 938 - } 931 931 + ret = ext4_prepare_inline_data(handle, inode, pos + len); 932 932 + if (ret && ret != -ENOSPC) 933 933 + goto out_journal; 939 934 940 935 /* 941 936 * We cannot recurse into the filesystem as the transaction ··· 1128 1133 struct ext4_iloc *iloc, 1129 1134 void *buf, int inline_size) 1130 1135 { 1131 1131 - ext4_create_inline_data(handle, inode, inline_size); 1136 1136 + int ret; 1137 1137 + 1138 1138 + ret = ext4_create_inline_data(handle, inode, inline_size); 1139 1139 + if (ret) { 1140 1140 + ext4_msg(inode->i_sb, KERN_EMERG, 1141 1141 + "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)", 1142 1142 + inode->i_ino, ret); 1143 1143 + return; 1144 1144 + } 1132 1145 ext4_write_inline_data(inode, iloc, buf, 0, inline_size); 1133 1146 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 1134 1147 }

+6 -6

fs/ext4/inode.c

reviewed

··· 338 338 return; 339 339 no_delete: 340 340 if (!list_empty(&EXT4_I(inode)->i_fc_list)) 341 341 - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); 341 341 + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); 342 342 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ 343 343 } 344 344 ··· 1224 1224 /* 1225 1225 * __block_write_begin may have instantiated a few blocks 1226 1226 * outside i_size. Trim these off again. Don't need 1227 1227 - * i_size_read because we hold i_mutex. 1227 1227 + * i_size_read because we hold i_rwsem. 1228 1228 * 1229 1229 * Add inode to orphan list in case we crash before 1230 1230 * truncate finishes ··· 3979 3979 3980 3980 } 3981 3981 3982 3982 - /* Wait all existing dio workers, newcomers will block on i_mutex */ 3982 3982 + /* Wait all existing dio workers, newcomers will block on i_rwsem */ 3983 3983 inode_dio_wait(inode); 3984 3984 3985 3985 /* ··· 4129 4129 /* 4130 4130 * There is a possibility that we're either freeing the inode 4131 4131 * or it's a completely new inode. In those cases we might not 4132 4132 - * have i_mutex locked because it's not necessary. 4132 4132 + * have i_rwsem locked because it's not necessary. 4133 4133 */ 4134 4134 if (!(inode->i_state & (I_NEW|I_FREEING))) 4135 4135 WARN_ON(!inode_is_locked(inode)); ··· 5271 5271 * transaction are already on disk (truncate waits for pages under 5272 5272 * writeback). 5273 5273 * 5274 5274 - * Called with inode->i_mutex down. 5274 5274 + * Called with inode->i_rwsem down. 5275 5275 */ 5276 5276 int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, 5277 5277 struct iattr *attr) ··· 5983 5983 return PTR_ERR(handle); 5984 5984 5985 5985 ext4_fc_mark_ineligible(inode->i_sb, 5986 5986 - EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); 5986 5986 + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); 5987 5987 err = ext4_mark_inode_dirty(handle, inode); 5988 5988 ext4_handle_sync(handle); 5989 5989 ext4_journal_stop(handle);

+2 -2

fs/ext4/ioctl.c

reviewed

··· 411 411 err = -EINVAL; 412 412 goto err_out; 413 413 } 414 414 - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); 414 414 + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); 415 415 416 416 /* Protect extent tree against block allocations via delalloc */ 417 417 ext4_double_down_write_data_sem(inode, inode_bl); ··· 1373 1373 1374 1374 err = ext4_resize_fs(sb, n_blocks_count); 1375 1375 if (EXT4_SB(sb)->s_journal) { 1376 1376 - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE); 1376 1376 + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); 1377 1377 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 1378 1378 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); 1379 1379 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);

+17 -9

fs/ext4/mballoc.c

reviewed

··· 5753 5753 struct super_block *sb = ar->inode->i_sb; 5754 5754 ext4_group_t group; 5755 5755 ext4_grpblk_t blkoff; 5756 5756 - int i = sb->s_blocksize; 5756 5756 + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); 5757 5757 + ext4_grpblk_t i = 0; 5757 5758 ext4_fsblk_t goal, block; 5758 5759 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 5759 5760 ··· 5776 5775 ext4_get_group_no_and_offset(sb, 5777 5776 max(ext4_group_first_block_no(sb, group), goal), 5778 5777 NULL, &blkoff); 5779 5779 - i = mb_find_next_zero_bit(bitmap_bh->b_data, sb->s_blocksize, 5778 5778 + while (1) { 5779 5779 + i = mb_find_next_zero_bit(bitmap_bh->b_data, max, 5780 5780 blkoff); 5781 5781 + if (i >= max) 5782 5782 + break; 5783 5783 + if (ext4_fc_replay_check_excluded(sb, 5784 5784 + ext4_group_first_block_no(sb, group) + i)) { 5785 5785 + blkoff = i + 1; 5786 5786 + } else 5787 5787 + break; 5788 5788 + } 5781 5789 brelse(bitmap_bh); 5782 5782 - if (i >= sb->s_blocksize) 5783 5783 - continue; 5784 5784 - if (ext4_fc_replay_check_excluded(sb, 5785 5785 - ext4_group_first_block_no(sb, group) + i)) 5786 5786 - continue; 5787 5787 - break; 5790 5790 + if (i < max) 5791 5791 + break; 5788 5792 } 5789 5793 5790 5790 - if (group >= ext4_get_groups_count(sb) && i >= sb->s_blocksize) 5794 5794 + if (group >= ext4_get_groups_count(sb) || i >= max) { 5795 5795 + *errp = -ENOSPC; 5791 5796 return 0; 5797 5797 + } 5792 5798 5793 5799 block = ext4_group_first_block_no(sb, group) + i; 5794 5800 ext4_mb_mark_bb(sb, block, 1, 1);

+1 -1

fs/ext4/migrate.c

reviewed

··· 485 485 * when we add extents we extent the journal 486 486 */ 487 487 /* 488 488 - * Even though we take i_mutex we can still cause block 488 488 + * Even though we take i_rwsem we can still cause block 489 489 * allocation via mmap write to holes. If we have allocated 490 490 * new blocks we fail migrate. New block allocation will 491 491 * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated

+2 -2

fs/ext4/namei.c

reviewed

··· 3889 3889 * dirents in directories. 3890 3890 */ 3891 3891 ext4_fc_mark_ineligible(old.inode->i_sb, 3892 3892 - EXT4_FC_REASON_RENAME_DIR); 3892 3892 + EXT4_FC_REASON_RENAME_DIR, handle); 3893 3893 } else { 3894 3894 if (new.inode) 3895 3895 ext4_fc_track_unlink(handle, new.dentry); ··· 4049 4049 if (unlikely(retval)) 4050 4050 goto end_rename; 4051 4051 ext4_fc_mark_ineligible(new.inode->i_sb, 4052 4052 - EXT4_FC_REASON_CROSS_RENAME); 4052 4052 + EXT4_FC_REASON_CROSS_RENAME, handle); 4053 4053 if (old.dir_bh) { 4054 4054 retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); 4055 4055 if (retval)

+2 -2

fs/ext4/orphan.c

reviewed

··· 93 93 * At filesystem recovery time, we walk this list deleting unlinked 94 94 * inodes and truncating linked inodes in ext4_orphan_cleanup(). 95 95 * 96 96 - * Orphan list manipulation functions must be called under i_mutex unless 96 96 + * Orphan list manipulation functions must be called under i_rwsem unless 97 97 * we are just creating the inode or deleting it. 98 98 */ 99 99 int ext4_orphan_add(handle_t *handle, struct inode *inode) ··· 119 119 /* 120 120 * Orphan handling is only valid for files with data blocks 121 121 * being truncated, or files being unlinked. Note that we either 122 122 - * hold i_mutex, or the inode can not be referenced from outside, 122 122 + * hold i_rwsem, or the inode can not be referenced from outside, 123 123 * so i_nlink should not be bumped due to race 124 124 */ 125 125 ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||

+2 -2

fs/ext4/super.c

reviewed

··· 5082 5082 INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); 5083 5083 sbi->s_fc_bytes = 0; 5084 5084 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 5085 5085 - ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); 5085 5085 + sbi->s_fc_ineligible_tid = 0; 5086 5086 spin_lock_init(&sbi->s_fc_lock); 5087 5087 memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); 5088 5088 sbi->s_fc_replay_state.fc_regions = NULL; ··· 5540 5540 5541 5541 sbi = ext4_alloc_sbi(sb); 5542 5542 if (!sbi) 5543 5543 - ret = -ENOMEM; 5543 5543 + return -ENOMEM; 5544 5544 5545 5545 fc->s_fs_info = sbi; 5546 5546

+3 -3

fs/ext4/xattr.c

reviewed

··· 2408 2408 if (IS_SYNC(inode)) 2409 2409 ext4_handle_sync(handle); 2410 2410 } 2411 2411 - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); 2411 2411 + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); 2412 2412 2413 2413 cleanup: 2414 2414 brelse(is.iloc.bh); ··· 2486 2486 if (error == 0) 2487 2487 error = error2; 2488 2488 } 2489 2489 - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); 2489 2489 + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL); 2490 2490 2491 2491 return error; 2492 2492 } ··· 2920 2920 error); 2921 2921 goto cleanup; 2922 2922 } 2923 2923 - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); 2923 2923 + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); 2924 2924 } 2925 2925 error = 0; 2926 2926 cleanup:

+4 -17

fs/jbd2/commit.c

reviewed

··· 484 484 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 485 485 stats.run.rs_locked); 486 486 487 487 - spin_lock(&commit_transaction->t_handle_lock); 488 488 - while (atomic_read(&commit_transaction->t_updates)) { 489 489 - DEFINE_WAIT(wait); 487 487 + // waits for any t_updates to finish 488 488 + jbd2_journal_wait_updates(journal); 490 489 491 491 - prepare_to_wait(&journal->j_wait_updates, &wait, 492 492 - TASK_UNINTERRUPTIBLE); 493 493 - if (atomic_read(&commit_transaction->t_updates)) { 494 494 - spin_unlock(&commit_transaction->t_handle_lock); 495 495 - write_unlock(&journal->j_state_lock); 496 496 - schedule(); 497 497 - write_lock(&journal->j_state_lock); 498 498 - spin_lock(&commit_transaction->t_handle_lock); 499 499 - } 500 500 - finish_wait(&journal->j_wait_updates, &wait); 501 501 - } 502 502 - spin_unlock(&commit_transaction->t_handle_lock); 503 490 commit_transaction->t_state = T_SWITCH; 504 491 write_unlock(&journal->j_state_lock); 505 492 ··· 804 817 commit_transaction->t_state = T_COMMIT_DFLUSH; 805 818 write_unlock(&journal->j_state_lock); 806 819 807 807 - /* 820 820 + /* 808 821 * If the journal is not located on the file system device, 809 822 * then we must flush the file system device before we issue 810 823 * the commit record ··· 1157 1170 if (journal->j_commit_callback) 1158 1171 journal->j_commit_callback(journal, commit_transaction); 1159 1172 if (journal->j_fc_cleanup_callback) 1160 1160 - journal->j_fc_cleanup_callback(journal, 1); 1173 1173 + journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); 1161 1174 1162 1175 trace_jbd2_end_commit(journal, commit_transaction); 1163 1176 jbd_debug(1, "JBD2: commit %d complete, head %d\n",

+5 -1

fs/jbd2/journal.c

reviewed

··· 771 771 { 772 772 jbd2_journal_unlock_updates(journal); 773 773 if (journal->j_fc_cleanup_callback) 774 774 - journal->j_fc_cleanup_callback(journal, 0); 774 774 + journal->j_fc_cleanup_callback(journal, 0, tid); 775 775 write_lock(&journal->j_state_lock); 776 776 journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; 777 777 if (fallback) ··· 1287 1287 1288 1288 /** 1289 1289 * jbd2_journal_shrink_scan() 1290 1290 + * @shrink: shrinker to work on 1291 1291 + * @sc: reclaim request to process 1290 1292 * 1291 1293 * Scan the checkpointed buffer on the checkpoint list and release the 1292 1294 * journal_head. ··· 1314 1312 1315 1313 /** 1316 1314 * jbd2_journal_shrink_count() 1315 1315 + * @shrink: shrinker to work on 1316 1316 + * @sc: reclaim request to process 1317 1317 * 1318 1318 * Count the number of checkpoint buffers on the checkpoint list. 1319 1319 */

+32 -21

fs/jbd2/transaction.c

reviewed

··· 449 449 } 450 450 451 451 /* OK, account for the buffers that this operation expects to 452 452 - * use and add the handle to the running transaction. 452 452 + * use and add the handle to the running transaction. 453 453 */ 454 454 update_t_max_wait(transaction, ts); 455 455 handle->h_transaction = transaction; ··· 836 836 } 837 837 EXPORT_SYMBOL(jbd2_journal_restart); 838 838 839 839 + /* 840 840 + * Waits for any outstanding t_updates to finish. 841 841 + * This is called with write j_state_lock held. 842 842 + */ 843 843 + void jbd2_journal_wait_updates(journal_t *journal) 844 844 + { 845 845 + transaction_t *commit_transaction = journal->j_running_transaction; 846 846 + 847 847 + if (!commit_transaction) 848 848 + return; 849 849 + 850 850 + spin_lock(&commit_transaction->t_handle_lock); 851 851 + while (atomic_read(&commit_transaction->t_updates)) { 852 852 + DEFINE_WAIT(wait); 853 853 + 854 854 + prepare_to_wait(&journal->j_wait_updates, &wait, 855 855 + TASK_UNINTERRUPTIBLE); 856 856 + if (atomic_read(&commit_transaction->t_updates)) { 857 857 + spin_unlock(&commit_transaction->t_handle_lock); 858 858 + write_unlock(&journal->j_state_lock); 859 859 + schedule(); 860 860 + write_lock(&journal->j_state_lock); 861 861 + spin_lock(&commit_transaction->t_handle_lock); 862 862 + } 863 863 + finish_wait(&journal->j_wait_updates, &wait); 864 864 + } 865 865 + spin_unlock(&commit_transaction->t_handle_lock); 866 866 + } 867 867 + 839 868 /** 840 869 * jbd2_journal_lock_updates () - establish a transaction barrier. 841 870 * @journal: Journal to establish a barrier on. 
··· 892 863 write_lock(&journal->j_state_lock); 893 864 } 894 865 895 895 - /* Wait until there are no running updates */ 896 896 - while (1) { 897 897 - transaction_t *transaction = journal->j_running_transaction; 866 866 + /* Wait until there are no running t_updates */ 867 867 + jbd2_journal_wait_updates(journal); 898 868 899 899 - if (!transaction) 900 900 - break; 901 901 - 902 902 - spin_lock(&transaction->t_handle_lock); 903 903 - prepare_to_wait(&journal->j_wait_updates, &wait, 904 904 - TASK_UNINTERRUPTIBLE); 905 905 - if (!atomic_read(&transaction->t_updates)) { 906 906 - spin_unlock(&transaction->t_handle_lock); 907 907 - finish_wait(&journal->j_wait_updates, &wait); 908 908 - break; 909 909 - } 910 910 - spin_unlock(&transaction->t_handle_lock); 911 911 - write_unlock(&journal->j_state_lock); 912 912 - schedule(); 913 913 - finish_wait(&journal->j_wait_updates, &wait); 914 914 - write_lock(&journal->j_state_lock); 915 915 - } 916 869 write_unlock(&journal->j_state_lock); 917 870 918 871 /*

+4 -9

include/linux/jbd2.h

reviewed

··· 594 594 */ 595 595 unsigned long t_log_start; 596 596 597 597 - /* 597 597 + /* 598 598 * Number of buffers on the t_buffers list [j_list_lock, no locks 599 599 * needed for jbd2 thread] 600 600 */ ··· 1295 1295 * Clean-up after fast commit or full commit. JBD2 calls this function 1296 1296 * after every commit operation. 1297 1297 */ 1298 1298 - void (*j_fc_cleanup_callback)(struct journal_s *journal, int); 1298 1298 + void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); 1299 1299 1300 1300 /** 1301 1301 * @j_fc_replay_callback: ··· 1419 1419 extern bool __jbd2_journal_refile_buffer(struct journal_head *); 1420 1420 extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); 1421 1421 extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); 1422 1422 - extern void __journal_free_buffer(struct journal_head *bh); 1423 1422 extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); 1424 1424 - extern void __journal_clean_data_list(transaction_t *transaction); 1425 1423 static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh) 1426 1424 { 1427 1425 list_add_tail(&bh->b_assoc_buffers, head); ··· 1484 1486 struct buffer_head **bh_out, 1485 1487 sector_t blocknr); 1486 1488 1487 1487 - /* Transaction locking */ 1488 1488 - extern void __wait_on_journal (journal_t *); 1489 1489 - 1490 1489 /* Transaction cache support */ 1491 1490 extern void jbd2_journal_destroy_transaction_cache(void); 1492 1491 extern int __init jbd2_journal_init_transaction_cache(void); ··· 1537 1542 extern int jbd2_journal_flush(journal_t *journal, unsigned int flags); 1538 1543 extern void jbd2_journal_lock_updates (journal_t *); 1539 1544 extern void jbd2_journal_unlock_updates (journal_t *); 1545 1545 + 1546 1546 + void jbd2_journal_wait_updates(journal_t *); 1540 1547 1541 1548 extern journal_t * jbd2_journal_init_dev(struct block_device *bdev, 1542 1549 struct block_device 
*fs_dev, ··· 1770 1773 #define BJ_Shadow 3 /* Buffer contents being shadowed to the log */ 1771 1774 #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ 1772 1775 #define BJ_Types 5 1773 1773 - 1774 1774 - extern int jbd_blocks_per_page(struct inode *inode); 1775 1776 1776 1777 /* JBD uses a CRC32 checksum */ 1777 1778 #define JBD_MAX_CHECKSUM_SIZE 4