Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"Improve performance for ext4 by allowing multiple processes to perform
direct I/O writes to preallocated blocks by using a shared inode lock
instead of taking an exclusive lock.

In addition, multiple bug fixes and cleanups"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
ext4: fix incorrect options show of original mount_opt and extend mount_opt2
ext4: Fix possible corruption when moving a directory
ext4: init error handle resource before init group descriptors
ext4: fix task hung in ext4_xattr_delete_inode
jbd2: fix data missing when reusing bh which is ready to be checkpointed
ext4: update s_journal_inum if it changes after journal replay
ext4: fail ext4_iget if special inode unallocated
ext4: fix function prototype mismatch for ext4_feat_ktype
ext4: remove unnecessary variable initialization
ext4: fix inode tree inconsistency caused by ENOMEM
ext4: refuse to create ea block when umounted
ext4: optimize ea_inode block expansion
ext4: remove dead code in updating backup sb
ext4: dio take shared inode lock when overwriting preallocated blocks
ext4: don't show commit interval if it is zero
ext4: use ext4_fc_tl_mem in fast-commit replay path
ext4: improve xattr consistency checking and error reporting

Linus Torvalds 3 years ago b07ce43d ae3419fb

+247 -146

10 changed files

expand all collapse all

ext4

ext4.h

extents.c

fast_commit.c

file.c

inode.c

ioctl.c

namei.c

super.c

xattr.c

jbd2

transaction.c

fs/ext4/ext4.h

reviewed

··· 1529 1529 unsigned int s_mount_opt2; 1530 1530 unsigned long s_mount_flags; 1531 1531 unsigned int s_def_mount_opt; 1532 1532 + unsigned int s_def_mount_opt2; 1532 1533 ext4_fsblk_t s_sb_block; 1533 1534 atomic64_t s_resv_clusters; 1534 1535 kuid_t s_resuid;

+1 -1

fs/ext4/extents.c

reviewed

··· 3251 3251 ext4_ext_mark_unwritten(ex2); 3252 3252 3253 3253 err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); 3254 3254 - if (err != -ENOSPC && err != -EDQUOT) 3254 3254 + if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) 3255 3255 goto out; 3256 3256 3257 3257 if (EXT4_EXT_MAY_ZEROOUT & split_flag) {

+26 -18

fs/ext4/fast_commit.c

reviewed

··· 1332 1332 char *dname; 1333 1333 }; 1334 1334 1335 1335 + /* Same as struct ext4_fc_tl, but uses native endianness fields */ 1336 1336 + struct ext4_fc_tl_mem { 1337 1337 + u16 fc_tag; 1338 1338 + u16 fc_len; 1339 1339 + }; 1340 1340 + 1335 1341 static inline void tl_to_darg(struct dentry_info_args *darg, 1336 1336 - struct ext4_fc_tl *tl, u8 *val) 1342 1342 + struct ext4_fc_tl_mem *tl, u8 *val) 1337 1343 { 1338 1344 struct ext4_fc_dentry_info fcd; 1339 1345 ··· 1351 1345 darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); 1352 1346 } 1353 1347 1354 1354 - static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val) 1348 1348 + static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) 1355 1349 { 1356 1356 - memcpy(tl, val, EXT4_FC_TAG_BASE_LEN); 1357 1357 - tl->fc_len = le16_to_cpu(tl->fc_len); 1358 1358 - tl->fc_tag = le16_to_cpu(tl->fc_tag); 1350 1350 + struct ext4_fc_tl tl_disk; 1351 1351 + 1352 1352 + memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); 1353 1353 + tl->fc_len = le16_to_cpu(tl_disk.fc_len); 1354 1354 + tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); 1359 1355 } 1360 1356 1361 1357 /* Unlink replay function */ 1362 1362 - static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, 1363 1363 - u8 *val) 1358 1358 + static int ext4_fc_replay_unlink(struct super_block *sb, 1359 1359 + struct ext4_fc_tl_mem *tl, u8 *val) 1364 1360 { 1365 1361 struct inode *inode, *old_parent; 1366 1362 struct qstr entry; ··· 1459 1451 } 1460 1452 1461 1453 /* Link replay function */ 1462 1462 - static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl, 1463 1463 - u8 *val) 1454 1454 + static int ext4_fc_replay_link(struct super_block *sb, 1455 1455 + struct ext4_fc_tl_mem *tl, u8 *val) 1464 1456 { 1465 1457 struct inode *inode; 1466 1458 struct dentry_info_args darg; ··· 1514 1506 /* 1515 1507 * Inode replay function 1516 1508 */ 1517 1517 - static int ext4_fc_replay_inode(struct super_block *sb, struct 
ext4_fc_tl *tl, 1518 1518 - u8 *val) 1509 1509 + static int ext4_fc_replay_inode(struct super_block *sb, 1510 1510 + struct ext4_fc_tl_mem *tl, u8 *val) 1519 1511 { 1520 1512 struct ext4_fc_inode fc_inode; 1521 1513 struct ext4_inode *raw_inode; ··· 1617 1609 * inode for which we are trying to create a dentry here, should already have 1618 1610 * been replayed before we start here. 1619 1611 */ 1620 1620 - static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, 1621 1621 - u8 *val) 1612 1612 + static int ext4_fc_replay_create(struct super_block *sb, 1613 1613 + struct ext4_fc_tl_mem *tl, u8 *val) 1622 1614 { 1623 1615 int ret = 0; 1624 1616 struct inode *inode = NULL; ··· 1716 1708 1717 1709 /* Replay add range tag */ 1718 1710 static int ext4_fc_replay_add_range(struct super_block *sb, 1719 1719 - struct ext4_fc_tl *tl, u8 *val) 1711 1711 + struct ext4_fc_tl_mem *tl, u8 *val) 1720 1712 { 1721 1713 struct ext4_fc_add_range fc_add_ex; 1722 1714 struct ext4_extent newex, *ex; ··· 1836 1828 1837 1829 /* Replay DEL_RANGE tag */ 1838 1830 static int 1839 1839 - ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, 1840 1840 - u8 *val) 1831 1831 + ext4_fc_replay_del_range(struct super_block *sb, 1832 1832 + struct ext4_fc_tl_mem *tl, u8 *val) 1841 1833 { 1842 1834 struct inode *inode; 1843 1835 struct ext4_fc_del_range lrange; ··· 2033 2025 struct ext4_fc_replay_state *state; 2034 2026 int ret = JBD2_FC_REPLAY_CONTINUE; 2035 2027 struct ext4_fc_add_range ext; 2036 2036 - struct ext4_fc_tl tl; 2028 2028 + struct ext4_fc_tl_mem tl; 2037 2029 struct ext4_fc_tail tail; 2038 2030 __u8 *start, *end, *cur, *val; 2039 2031 struct ext4_fc_head head; ··· 2152 2144 { 2153 2145 struct super_block *sb = journal->j_private; 2154 2146 struct ext4_sb_info *sbi = EXT4_SB(sb); 2155 2155 - struct ext4_fc_tl tl; 2147 2147 + struct ext4_fc_tl_mem tl; 2156 2148 __u8 *start, *end, *cur, *val; 2157 2149 int ret = JBD2_FC_REPLAY_CONTINUE; 2158 2150 
struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;

+22 -12

fs/ext4/file.c

reviewed

··· 202 202 return false; 203 203 } 204 204 205 205 - /* Is IO overwriting allocated and initialized blocks? */ 206 206 - static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) 205 205 + /* Is IO overwriting allocated or initialized blocks? */ 206 206 + static bool ext4_overwrite_io(struct inode *inode, 207 207 + loff_t pos, loff_t len, bool *unwritten) 207 208 { 208 209 struct ext4_map_blocks map; 209 210 unsigned int blkbits = inode->i_blkbits; ··· 218 217 blklen = map.m_len; 219 218 220 219 err = ext4_map_blocks(NULL, inode, &map, 0); 220 220 + if (err != blklen) 221 221 + return false; 221 222 /* 222 223 * 'err==len' means that all of the blocks have been preallocated, 223 223 - * regardless of whether they have been initialized or not. To exclude 224 224 - * unwritten extents, we need to check m_flags. 224 224 + * regardless of whether they have been initialized or not. We need to 225 225 + * check m_flags to distinguish the unwritten extents. 225 226 */ 226 226 - return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); 227 227 + *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); 228 228 + return true; 227 229 } 228 230 229 231 static ssize_t ext4_generic_write_checks(struct kiocb *iocb, ··· 435 431 * - For extending writes case we don't take the shared lock, since it requires 436 432 * updating inode i_disksize and/or orphan handling with exclusive lock. 437 433 * 438 438 - * - shared locking will only be true mostly with overwrites. Otherwise we will 439 439 - * switch to exclusive i_rwsem lock. 434 434 + * - shared locking will only be true mostly with overwrites, including 435 435 + * initialized blocks and unwritten blocks. For overwrite unwritten blocks 436 436 + * we protect splitting extents by i_data_sem in ext4_inode_info, so we can 437 437 + * also release exclusive i_rwsem lock. 438 438 + * 439 439 + * - Otherwise we will switch to exclusive i_rwsem lock. 
440 440 */ 441 441 static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, 442 442 - bool *ilock_shared, bool *extend) 442 442 + bool *ilock_shared, bool *extend, 443 443 + bool *unwritten) 443 444 { 444 445 struct file *file = iocb->ki_filp; 445 446 struct inode *inode = file_inode(file); ··· 468 459 * in file_modified(). 469 460 */ 470 461 if (*ilock_shared && (!IS_NOSEC(inode) || *extend || 471 471 - !ext4_overwrite_io(inode, offset, count))) { 462 462 + !ext4_overwrite_io(inode, offset, count, unwritten))) { 472 463 if (iocb->ki_flags & IOCB_NOWAIT) { 473 464 ret = -EAGAIN; 474 465 goto out; ··· 500 491 loff_t offset = iocb->ki_pos; 501 492 size_t count = iov_iter_count(from); 502 493 const struct iomap_ops *iomap_ops = &ext4_iomap_ops; 503 503 - bool extend = false, unaligned_io = false; 494 494 + bool extend = false, unaligned_io = false, unwritten = false; 504 495 bool ilock_shared = true; 505 496 506 497 /* ··· 543 534 return ext4_buffered_write_iter(iocb, from); 544 535 } 545 536 546 546 - ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); 537 537 + ret = ext4_dio_write_checks(iocb, from, 538 538 + &ilock_shared, &extend, &unwritten); 547 539 if (ret <= 0) 548 540 return ret; 549 541 ··· 592 582 ext4_journal_stop(handle); 593 583 } 594 584 595 595 - if (ilock_shared) 585 585 + if (ilock_shared && !unwritten) 596 586 iomap_ops = &ext4_iomap_overwrite_ops; 597 587 ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, 598 588 (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,

+9 -11

fs/ext4/inode.c

reviewed

··· 4872 4872 goto bad_inode; 4873 4873 raw_inode = ext4_raw_inode(&iloc); 4874 4874 4875 4875 - if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { 4876 4876 - ext4_error_inode(inode, function, line, 0, 4877 4877 - "iget: root inode unallocated"); 4878 4878 - ret = -EFSCORRUPTED; 4879 4879 - goto bad_inode; 4880 4880 - } 4881 4881 - 4882 4875 if ((flags & EXT4_IGET_HANDLE) && 4883 4876 (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { 4884 4877 ret = -ESTALE; ··· 4944 4951 * NeilBrown 1999oct15 4945 4952 */ 4946 4953 if (inode->i_nlink == 0) { 4947 4947 - if ((inode->i_mode == 0 || 4954 4954 + if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || 4948 4955 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && 4949 4956 ino != EXT4_BOOT_LOADER_INO) { 4950 4950 - /* this inode is deleted */ 4951 4951 - ret = -ESTALE; 4957 4957 + /* this inode is deleted or unallocated */ 4958 4958 + if (flags & EXT4_IGET_SPECIAL) { 4959 4959 + ext4_error_inode(inode, function, line, 0, 4960 4960 + "iget: special inode unallocated"); 4961 4961 + ret = -EFSCORRUPTED; 4962 4962 + } else 4963 4963 + ret = -ESTALE; 4952 4964 goto bad_inode; 4953 4965 } 4954 4966 /* The only unlinked inodes we let through here have ··· 5786 5788 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 5787 5789 int gdpblocks; 5788 5790 int idxblocks; 5789 5789 - int ret = 0; 5791 5791 + int ret; 5790 5792 5791 5793 /* 5792 5794 * How many index blocks need to touch to map @lblocks logical blocks

-3

fs/ext4/ioctl.c

reviewed

··· 155 155 set_buffer_uptodate(bh); 156 156 unlock_buffer(bh); 157 157 158 158 - if (err) 159 159 - goto out_bh; 160 160 - 161 158 if (handle) { 162 159 err = ext4_handle_dirty_metadata(handle, NULL, bh); 163 160 if (err)

+10 -1

fs/ext4/namei.c

reviewed

··· 3872 3872 if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) 3873 3873 goto end_rename; 3874 3874 } 3875 3875 + /* 3876 3876 + * We need to protect against old.inode directory getting 3877 3877 + * converted from inline directory format into a normal one. 3878 3878 + */ 3879 3879 + inode_lock_nested(old.inode, I_MUTEX_NONDIR2); 3875 3880 retval = ext4_rename_dir_prepare(handle, &old); 3876 3876 - if (retval) 3881 3881 + if (retval) { 3882 3882 + inode_unlock(old.inode); 3877 3883 goto end_rename; 3884 3884 + } 3878 3885 } 3879 3886 /* 3880 3887 * If we're renaming a file within an inline_data dir and adding or ··· 4013 4006 } else { 4014 4007 ext4_journal_stop(handle); 4015 4008 } 4009 4009 + if (old.dir_bh) 4010 4010 + inode_unlock(old.inode); 4016 4011 release_bh: 4017 4012 brelse(old.dir_bh); 4018 4013 brelse(old.bh);

+34 -22

fs/ext4/super.c

reviewed

··· 2146 2146 return 0; 2147 2147 case Opt_commit: 2148 2148 if (result.uint_32 == 0) 2149 2149 - ctx->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE; 2149 2149 + result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE; 2150 2150 else if (result.uint_32 > INT_MAX / HZ) { 2151 2151 ext4_msg(NULL, KERN_ERR, 2152 2152 "Invalid commit interval %d, " ··· 2883 2883 { 2884 2884 struct ext4_sb_info *sbi = EXT4_SB(sb); 2885 2885 struct ext4_super_block *es = sbi->s_es; 2886 2886 - int def_errors, def_mount_opt = sbi->s_def_mount_opt; 2886 2886 + int def_errors; 2887 2887 const struct mount_opts *m; 2888 2888 char sep = nodefs ? '\n' : ','; 2889 2889 ··· 2895 2895 2896 2896 for (m = ext4_mount_opts; m->token != Opt_err; m++) { 2897 2897 int want_set = m->flags & MOPT_SET; 2898 2898 + int opt_2 = m->flags & MOPT_2; 2899 2899 + unsigned int mount_opt, def_mount_opt; 2900 2900 + 2898 2901 if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || 2899 2902 m->flags & MOPT_SKIP) 2900 2903 continue; 2901 2901 - if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) 2902 2902 - continue; /* skip if same as the default */ 2904 2904 + 2905 2905 + if (opt_2) { 2906 2906 + mount_opt = sbi->s_mount_opt2; 2907 2907 + def_mount_opt = sbi->s_def_mount_opt2; 2908 2908 + } else { 2909 2909 + mount_opt = sbi->s_mount_opt; 2910 2910 + def_mount_opt = sbi->s_def_mount_opt; 2911 2911 + } 2912 2912 + /* skip if same as the default */ 2913 2913 + if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) 2914 2914 + continue; 2915 2915 + /* select Opt_noFoo vs Opt_Foo */ 2903 2916 if ((want_set && 2904 2904 - (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || 2905 2905 - (!want_set && (sbi->s_mount_opt & m->mount_opt))) 2906 2906 - continue; /* select Opt_noFoo vs Opt_Foo */ 2917 2917 + (mount_opt & m->mount_opt) != m->mount_opt) || 2918 2918 + (!want_set && (mount_opt & m->mount_opt))) 2919 2919 + continue; 2907 2920 SEQ_OPTS_PRINT("%s", token2str(m->token)); 2908 2921 } 2909 2922 ··· 2944 2931 
if (nodefs || sbi->s_stripe) 2945 2932 SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); 2946 2933 if (nodefs || EXT4_MOUNT_DATA_FLAGS & 2947 2947 - (sbi->s_mount_opt ^ def_mount_opt)) { 2934 2934 + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { 2948 2935 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 2949 2936 SEQ_OPTS_PUTS("data=journal"); 2950 2937 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) ··· 4740 4727 struct ext4_sb_info *sbi = EXT4_SB(sb); 4741 4728 unsigned int db_count; 4742 4729 ext4_fsblk_t block; 4743 4743 - int ret; 4744 4730 int i; 4745 4731 4746 4732 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / ··· 4779 4767 ext4_msg(sb, KERN_ERR, 4780 4768 "can't read group descriptor %d", i); 4781 4769 sbi->s_gdb_count = i; 4782 4782 - ret = PTR_ERR(bh); 4783 4783 - goto out; 4770 4770 + return PTR_ERR(bh); 4784 4771 } 4785 4772 rcu_read_lock(); 4786 4773 rcu_dereference(sbi->s_group_desc)[i] = bh; ··· 4788 4777 sbi->s_gdb_count = db_count; 4789 4778 if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { 4790 4779 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 4791 4791 - ret = -EFSCORRUPTED; 4792 4792 - goto out; 4780 4780 + return -EFSCORRUPTED; 4793 4781 } 4782 4782 + 4794 4783 return 0; 4795 4795 - out: 4796 4796 - ext4_group_desc_free(sbi); 4797 4797 - return ret; 4798 4784 } 4799 4785 4800 4786 static int ext4_load_and_init_journal(struct super_block *sb, ··· 5083 5075 goto failed_mount; 5084 5076 5085 5077 sbi->s_def_mount_opt = sbi->s_mount_opt; 5078 5078 + sbi->s_def_mount_opt2 = sbi->s_mount_opt2; 5086 5079 5087 5080 err = ext4_check_opt_consistency(fc, sb); 5088 5081 if (err < 0) ··· 5218 5209 if (ext4_geometry_check(sb, es)) 5219 5210 goto failed_mount; 5220 5211 5221 5221 - err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); 5222 5222 - if (err) 5223 5223 - goto failed_mount; 5224 5224 - 5225 5212 timer_setup(&sbi->s_err_report, print_daily_error_info, 0); 5226 5213 
spin_lock_init(&sbi->s_error_lock); 5227 5214 INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); 5215 5215 + 5216 5216 + err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); 5217 5217 + if (err) 5218 5218 + goto failed_mount3; 5228 5219 5229 5220 /* Register extent status tree shrinker */ 5230 5221 if (ext4_es_register_shrinker(sbi)) ··· 5946 5937 if (!really_read_only && journal_devnum && 5947 5938 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 5948 5939 es->s_journal_dev = cpu_to_le32(journal_devnum); 5949 5949 - 5950 5950 - /* Make sure we flush the recovery flag to disk. */ 5940 5940 + ext4_commit_super(sb); 5941 5941 + } 5942 5942 + if (!really_read_only && journal_inum && 5943 5943 + journal_inum != le32_to_cpu(es->s_journal_inum)) { 5944 5944 + es->s_journal_inum = cpu_to_le32(journal_inum); 5951 5945 ext4_commit_super(sb); 5952 5946 } 5953 5947

+115 -57

fs/ext4/xattr.c

reviewed

··· 184 184 } 185 185 186 186 static int 187 187 - ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, 188 188 - void *value_start) 187 187 + check_xattrs(struct inode *inode, struct buffer_head *bh, 188 188 + struct ext4_xattr_entry *entry, void *end, void *value_start, 189 189 + const char *function, unsigned int line) 189 190 { 190 191 struct ext4_xattr_entry *e = entry; 192 192 + int err = -EFSCORRUPTED; 193 193 + char *err_str; 194 194 + 195 195 + if (bh) { 196 196 + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 197 197 + BHDR(bh)->h_blocks != cpu_to_le32(1)) { 198 198 + err_str = "invalid header"; 199 199 + goto errout; 200 200 + } 201 201 + if (buffer_verified(bh)) 202 202 + return 0; 203 203 + if (!ext4_xattr_block_csum_verify(inode, bh)) { 204 204 + err = -EFSBADCRC; 205 205 + err_str = "invalid checksum"; 206 206 + goto errout; 207 207 + } 208 208 + } else { 209 209 + struct ext4_xattr_ibody_header *header = value_start; 210 210 + 211 211 + header -= 1; 212 212 + if (end - (void *)header < sizeof(*header) + sizeof(u32)) { 213 213 + err_str = "in-inode xattr block too small"; 214 214 + goto errout; 215 215 + } 216 216 + if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 217 217 + err_str = "bad magic number in in-inode xattr"; 218 218 + goto errout; 219 219 + } 220 220 + } 191 221 192 222 /* Find the end of the names list */ 193 223 while (!IS_LAST_ENTRY(e)) { 194 224 struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); 195 195 - if ((void *)next >= end) 196 196 - return -EFSCORRUPTED; 197 197 - if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) 198 198 - return -EFSCORRUPTED; 225 225 + if ((void *)next >= end) { 226 226 + err_str = "e_name out of bounds"; 227 227 + goto errout; 228 228 + } 229 229 + if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { 230 230 + err_str = "bad e_name length"; 231 231 + goto errout; 232 232 + } 199 233 e = next; 200 234 } 201 235 202 236 /* Check the values */ 203 237 while 
(!IS_LAST_ENTRY(entry)) { 204 238 u32 size = le32_to_cpu(entry->e_value_size); 239 239 + unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); 205 240 206 206 - if (size > EXT4_XATTR_SIZE_MAX) 207 207 - return -EFSCORRUPTED; 241 241 + if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { 242 242 + err_str = "ea_inode specified without ea_inode feature enabled"; 243 243 + goto errout; 244 244 + } 245 245 + if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || 246 246 + !ext4_valid_inum(inode->i_sb, ea_ino))) { 247 247 + err_str = "invalid ea_ino"; 248 248 + goto errout; 249 249 + } 250 250 + if (size > EXT4_XATTR_SIZE_MAX) { 251 251 + err_str = "e_value size too large"; 252 252 + goto errout; 253 253 + } 208 254 209 255 if (size != 0 && entry->e_value_inum == 0) { 210 256 u16 offs = le16_to_cpu(entry->e_value_offs); ··· 262 216 * the padded and unpadded sizes, since the size may 263 217 * overflow to 0 when adding padding. 264 218 */ 265 265 - if (offs > end - value_start) 266 266 - return -EFSCORRUPTED; 219 219 + if (offs > end - value_start) { 220 220 + err_str = "e_value out of bounds"; 221 221 + goto errout; 222 222 + } 267 223 value = value_start + offs; 268 224 if (value < (void *)e + sizeof(u32) || 269 225 size > end - value || 270 270 - EXT4_XATTR_SIZE(size) > end - value) 271 271 - return -EFSCORRUPTED; 226 226 + EXT4_XATTR_SIZE(size) > end - value) { 227 227 + err_str = "overlapping e_value "; 228 228 + goto errout; 229 229 + } 272 230 } 273 231 entry = EXT4_XATTR_NEXT(entry); 274 232 } 275 275 - 233 233 + if (bh) 234 234 + set_buffer_verified(bh); 276 235 return 0; 236 236 + 237 237 + errout: 238 238 + if (bh) 239 239 + __ext4_error_inode(inode, function, line, 0, -err, 240 240 + "corrupted xattr block %llu: %s", 241 241 + (unsigned long long) bh->b_blocknr, 242 242 + err_str); 243 243 + else 244 244 + __ext4_error_inode(inode, function, line, 0, -err, 245 245 + "corrupted in-inode xattr: %s", err_str); 246 246 + return err; 277 247 } 278 248 279 249 static 
inline int 280 250 __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, 281 251 const char *function, unsigned int line) 282 252 { 283 283 - int error = -EFSCORRUPTED; 284 284 - 285 285 - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 286 286 - BHDR(bh)->h_blocks != cpu_to_le32(1)) 287 287 - goto errout; 288 288 - if (buffer_verified(bh)) 289 289 - return 0; 290 290 - 291 291 - error = -EFSBADCRC; 292 292 - if (!ext4_xattr_block_csum_verify(inode, bh)) 293 293 - goto errout; 294 294 - error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, 295 295 - bh->b_data); 296 296 - errout: 297 297 - if (error) 298 298 - __ext4_error_inode(inode, function, line, 0, -error, 299 299 - "corrupted xattr block %llu", 300 300 - (unsigned long long) bh->b_blocknr); 301 301 - else 302 302 - set_buffer_verified(bh); 303 303 - return error; 253 253 + return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, 254 254 + bh->b_data, function, line); 304 255 } 305 256 306 257 #define ext4_xattr_check_block(inode, bh) \ 307 258 __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) 308 259 309 260 310 310 - static int 261 261 + static inline int 311 262 __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, 312 263 void *end, const char *function, unsigned int line) 313 264 { 314 314 - int error = -EFSCORRUPTED; 315 315 - 316 316 - if (end - (void *)header < sizeof(*header) + sizeof(u32) || 317 317 - (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) 318 318 - goto errout; 319 319 - error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); 320 320 - errout: 321 321 - if (error) 322 322 - __ext4_error_inode(inode, function, line, 0, -error, 323 323 - "corrupted in-inode xattr"); 324 324 - return error; 265 265 + return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), 266 266 + function, line); 325 267 } 326 268 327 269 #define xattr_check_inode(inode, header, end) \ ··· 421 387 { 422 
388 struct inode *inode; 423 389 int err; 390 390 + 391 391 + /* 392 392 + * We have to check for this corruption early as otherwise 393 393 + * iget_locked() could wait indefinitely for the state of our 394 394 + * parent inode. 395 395 + */ 396 396 + if (parent->i_ino == ea_ino) { 397 397 + ext4_error(parent->i_sb, 398 398 + "Parent and EA inode have the same ino %lu", ea_ino); 399 399 + return -EFSCORRUPTED; 400 400 + } 424 401 425 402 inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); 426 403 if (IS_ERR(inode)) { ··· 1482 1437 struct inode *ea_inode = NULL; 1483 1438 uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; 1484 1439 int err; 1440 1440 + 1441 1441 + if (inode->i_sb->s_root == NULL) { 1442 1442 + ext4_warning(inode->i_sb, 1443 1443 + "refuse to create EA inode when umounting"); 1444 1444 + WARN_ON(1); 1445 1445 + return ERR_PTR(-EINVAL); 1446 1446 + } 1485 1447 1486 1448 /* 1487 1449 * Let the next inode be the goal, so we try and allocate the EA inode ··· 2619 2567 2620 2568 is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); 2621 2569 bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); 2622 2622 - buffer = kvmalloc(value_size, GFP_NOFS); 2623 2570 b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); 2624 2624 - if (!is || !bs || !buffer || !b_entry_name) { 2571 2571 + if (!is || !bs || !b_entry_name) { 2625 2572 error = -ENOMEM; 2626 2573 goto out; 2627 2574 } ··· 2632 2581 2633 2582 /* Save the entry name and the entry value */ 2634 2583 if (entry->e_value_inum) { 2584 2584 + buffer = kvmalloc(value_size, GFP_NOFS); 2585 2585 + if (!buffer) { 2586 2586 + error = -ENOMEM; 2587 2587 + goto out; 2588 2588 + } 2589 2589 + 2635 2590 error = ext4_xattr_inode_get(inode, entry, buffer, value_size); 2636 2591 if (error) 2637 2592 goto out; 2638 2593 } else { 2639 2594 size_t value_offs = le16_to_cpu(entry->e_value_offs); 2640 2640 - memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); 2595 2595 + buffer = 
(void *)IFIRST(header) + value_offs; 2641 2596 } 2642 2597 2643 2598 memcpy(b_entry_name, entry->e_name, entry->e_name_len); ··· 2658 2601 if (error) 2659 2602 goto out; 2660 2603 2661 2661 - /* Remove the chosen entry from the inode */ 2662 2662 - error = ext4_xattr_ibody_set(handle, inode, &i, is); 2663 2663 - if (error) 2664 2664 - goto out; 2665 2665 - 2666 2604 i.value = buffer; 2667 2605 i.value_len = value_size; 2668 2606 error = ext4_xattr_block_find(inode, &i, bs); 2669 2607 if (error) 2670 2608 goto out; 2671 2609 2672 2672 - /* Add entry which was removed from the inode into the block */ 2610 2610 + /* Move ea entry from the inode into the block */ 2673 2611 error = ext4_xattr_block_set(handle, inode, &i, bs); 2674 2612 if (error) 2675 2613 goto out; 2676 2676 - error = 0; 2614 2614 + 2615 2615 + /* Remove the chosen entry from the inode */ 2616 2616 + i.value = NULL; 2617 2617 + i.value_len = 0; 2618 2618 + error = ext4_xattr_ibody_set(handle, inode, &i, is); 2619 2619 + 2677 2620 out: 2678 2621 kfree(b_entry_name); 2679 2679 - kvfree(buffer); 2622 2622 + if (entry->e_value_inum && buffer) 2623 2623 + kvfree(buffer); 2680 2624 if (is) 2681 2625 brelse(is->iloc.bh); 2682 2626 if (bs)

+29 -21

fs/jbd2/transaction.c

reviewed

··· 1010 1010 * ie. locked but not dirty) or tune2fs (which may actually have 1011 1011 * the buffer dirtied, ugh.) */ 1012 1012 1013 1013 - if (buffer_dirty(bh)) { 1013 1013 + if (buffer_dirty(bh) && jh->b_transaction) { 1014 1014 + warn_dirty_buffer(bh); 1014 1015 /* 1015 1015 - * First question: is this buffer already part of the current 1016 1016 - * transaction or the existing committing transaction? 1017 1017 - */ 1018 1018 - if (jh->b_transaction) { 1019 1019 - J_ASSERT_JH(jh, 1020 1020 - jh->b_transaction == transaction || 1021 1021 - jh->b_transaction == 1022 1022 - journal->j_committing_transaction); 1023 1023 - if (jh->b_next_transaction) 1024 1024 - J_ASSERT_JH(jh, jh->b_next_transaction == 1025 1025 - transaction); 1026 1026 - warn_dirty_buffer(bh); 1027 1027 - } 1028 1028 - /* 1029 1029 - * In any case we need to clean the dirty flag and we must 1030 1030 - * do it under the buffer lock to be sure we don't race 1031 1031 - * with running write-out. 1016 1016 + * We need to clean the dirty flag and we must do it under the 1017 1017 + * buffer lock to be sure we don't race with running write-out. 1032 1018 */ 1033 1019 JBUFFER_TRACE(jh, "Journalling dirty buffer"); 1034 1020 clear_buffer_dirty(bh); 1021 1021 + /* 1022 1022 + * The buffer is going to be added to BJ_Reserved list now and 1023 1023 + * nothing guarantees jbd2_journal_dirty_metadata() will be 1024 1024 + * ever called for it. So we need to set jbddirty bit here to 1025 1025 + * make sure the buffer is dirtied and written out when the 1026 1026 + * journaling machinery is done with it. 
1027 1027 + */ 1035 1028 set_buffer_jbddirty(bh); 1036 1029 } 1037 1037 - 1038 1038 - unlock_buffer(bh); 1039 1030 1040 1031 error = -EROFS; 1041 1032 if (is_handle_aborted(handle)) { 1042 1033 spin_unlock(&jh->b_state_lock); 1034 1034 + unlock_buffer(bh); 1043 1035 goto out; 1044 1036 } 1045 1037 error = 0; ··· 1041 1049 * b_next_transaction points to it 1042 1050 */ 1043 1051 if (jh->b_transaction == transaction || 1044 1044 - jh->b_next_transaction == transaction) 1052 1052 + jh->b_next_transaction == transaction) { 1053 1053 + unlock_buffer(bh); 1045 1054 goto done; 1055 1055 + } 1046 1056 1047 1057 /* 1048 1058 * this is the first time this transaction is touching this buffer, ··· 1068 1074 */ 1069 1075 smp_wmb(); 1070 1076 spin_lock(&journal->j_list_lock); 1077 1077 + if (test_clear_buffer_dirty(bh)) { 1078 1078 + /* 1079 1079 + * Execute buffer dirty clearing and jh->b_transaction 1080 1080 + * assignment under journal->j_list_lock locked to 1081 1081 + * prevent bh being removed from checkpoint list if 1082 1082 + * the buffer is in an intermediate state (not dirty 1083 1083 + * and jh->b_transaction is NULL). 1084 1084 + */ 1085 1085 + JBUFFER_TRACE(jh, "Journalling dirty buffer"); 1086 1086 + set_buffer_jbddirty(bh); 1087 1087 + } 1071 1088 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); 1072 1089 spin_unlock(&journal->j_list_lock); 1090 1090 + unlock_buffer(bh); 1073 1091 goto done; 1074 1092 } 1093 1093 + unlock_buffer(bh); 1094 1094 + 1075 1095 /* 1076 1096 * If there is already a copy-out version of this buffer, then we don't 1077 1097 * need to make another one