Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

+1 -1

Documentation/conf.py

··· 34 34 # Add any Sphinx extension module names here, as strings. They can be 35 35 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 36 # ones. 37 - extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure'] 37 + extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig'] 38 38 39 39 # The name of the math extension changed on Sphinx 1.4 40 40 if major == 1 and minor > 3:

+66 -80

Documentation/filesystems/ext4.txt Documentation/filesystems/ext4/ext4.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 1 2 2 - Ext4 Filesystem 3 - =============== 3 + ======================== 4 + General Information 5 + ======================== 4 6 5 7 Ext4 is an advanced level of the ext3 filesystem which incorporates 6 8 scalability and reliability enhancements for supporting large filesystems ··· 13 11 Web site: http://ext4.wiki.kernel.org 14 12 15 13 16 - 1. Quick usage instructions: 17 - =========================== 14 + Quick usage instructions 15 + ======================== 18 16 19 17 Note: More extensive information for getting started with ext4 can be 20 - found at the ext4 wiki site at the URL: 21 - http://ext4.wiki.kernel.org/index.php/Ext4_Howto 18 + found at the ext4 wiki site at the URL: 19 + http://ext4.wiki.kernel.org/index.php/Ext4_Howto 22 20 23 - - Compile and install the latest version of e2fsprogs (as of this 24 - writing version 1.41.3) from: 25 - 26 - http://sourceforge.net/project/showfiles.php?group_id=2406 27 - 28 - or 21 + - The latest version of e2fsprogs can be found at: 29 22 30 23 https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ 31 24 25 + or 26 + 27 + http://sourceforge.net/project/showfiles.php?group_id=2406 28 + 32 29 or grab the latest git repository from: 33 30 34 - git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git 35 - 36 - - Note that it is highly important to install the mke2fs.conf file 37 - that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If 38 - you have edited the /etc/mke2fs.conf file installed on your system, 39 - you will need to merge your changes with the version from e2fsprogs 40 - 1.41.x. 
31 + https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git 41 32 42 33 - Create a new filesystem using the ext4 filesystem type: 43 34 44 - # mke2fs -t ext4 /dev/hda1 35 + # mke2fs -t ext4 /dev/hda1 45 36 46 - Or to configure an existing ext3 filesystem to support extents: 37 + Or to configure an existing ext3 filesystem to support extents: 47 38 48 39 # tune2fs -O extents /dev/hda1 49 40 ··· 44 49 converted to use 256 byte for greater efficiency via: 45 50 46 51 # tune2fs -I 256 /dev/hda1 47 - 48 - (Note: we currently do not have tools to convert an ext4 49 - filesystem back to ext3; so please do not do try this on production 50 - filesystems.) 51 52 52 53 - Mounting: 53 54 ··· 66 75 the filesystem with a large journal can also be helpful for 67 76 metadata-intensive workloads. 68 77 69 - 2. Features 70 - =========== 78 + Features 79 + ======== 71 80 72 - 2.1 Currently available 81 + Currently Available 82 + ------------------- 73 83 74 84 * ability to use filesystems > 16TB (e2fsprogs support not available yet) 75 85 * extent format reduces metadata overhead (RAM, IO for access, transactions) ··· 95 103 [1] Filesystems with a block size of 1k may see a limit imposed by the 96 104 directory hash tree having a maximum depth of two. 97 105 98 - 2.2 Candidate features for future inclusion 99 - 100 - * online defrag (patches available but not well tested) 101 - * reduced mke2fs time via lazy itable initialization in conjunction with 102 - the uninit_bg feature (capability to do this is available in e2fsprogs 103 - but a kernel thread to do lazy zeroing of unused inode table blocks 104 - after filesystem is first mounted is required for safety) 105 - 106 - There are several others under discussion, whether they all make it in is 107 - partly a function of how much time everyone has to work on them. Features like 108 - metadata checksumming have been discussed and planned for a bit but no patches 109 - exist yet so I'm not sure they're in the near-term roadmap. 
110 - 111 - The big performance win will come with mballoc, delalloc and flex_bg 112 - grouping of bitmaps and inode tables. Some test results available here: 113 - 114 - - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html 115 - - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html 116 - 117 - 3. Options 118 - ========== 106 + Options 107 + ======= 119 108 120 109 When mounting an ext4 filesystem, the following option are accepted: 121 110 (*) == default 122 111 112 + ======================= ======================================================= 113 + Mount Option Description 114 + ======================= ======================================================= 123 115 ro Mount filesystem read only. Note that ext4 will 124 116 replay the journal (and thus write to the 125 117 partition) even when mounted "read only". The ··· 363 387 dax Use direct access (no page cache). See 364 388 Documentation/filesystems/dax.txt. Note that 365 389 this option is incompatible with data=journal. 390 + ======================= ======================================================= 366 391 367 392 Data Mode 368 393 ========= 369 394 There are 3 different data modes: 370 395 371 396 * writeback mode 372 - In data=writeback mode, ext4 does not journal data at all. This mode provides 373 - a similar level of journaling as that of XFS, JFS, and ReiserFS in its default 374 - mode - metadata journaling. A crash+recovery can cause incorrect data to 375 - appear in files which were written shortly before the crash. This mode will 376 - typically provide the best ext4 performance. 397 + 398 + In data=writeback mode, ext4 does not journal data at all. This mode provides 399 + a similar level of journaling as that of XFS, JFS, and ReiserFS in its default 400 + mode - metadata journaling. A crash+recovery can cause incorrect data to 401 + appear in files which were written shortly before the crash. 
This mode will 402 + typically provide the best ext4 performance. 377 403 378 404 * ordered mode 379 - In data=ordered mode, ext4 only officially journals metadata, but it logically 380 - groups metadata information related to data changes with the data blocks into a 381 - single unit called a transaction. When it's time to write the new metadata 382 - out to disk, the associated data blocks are written first. In general, 383 - this mode performs slightly slower than writeback but significantly faster than journal mode. 405 + 406 + In data=ordered mode, ext4 only officially journals metadata, but it logically 407 + groups metadata information related to data changes with the data blocks into 408 + a single unit called a transaction. When it's time to write the new metadata 409 + out to disk, the associated data blocks are written first. In general, this 410 + mode performs slightly slower than writeback but significantly faster than 411 + journal mode. 384 412 385 413 * journal mode 386 - data=journal mode provides full data and metadata journaling. All new data is 387 - written to the journal first, and then to its final location. 388 - In the event of a crash, the journal can be replayed, bringing both data and 389 - metadata into a consistent state. This mode is the slowest except when data 390 - needs to be read from and written to disk at the same time where it 391 - outperforms all others modes. Enabling this mode will disable delayed 392 - allocation and O_DIRECT support. 414 + 415 + data=journal mode provides full data and metadata journaling. All new data is 416 + written to the journal first, and then to its final location. In the event of 417 + a crash, the journal can be replayed, bringing both data and metadata into a 418 + consistent state. This mode is the slowest except when data needs to be read 419 + from and written to disk at the same time where it outperforms all others 420 + modes. 
Enabling this mode will disable delayed allocation and O_DIRECT 421 + support. 393 422 394 423 /proc entries 395 424 ============= ··· 406 425 in table below. 407 426 408 427 Files in /proc/fs/ext4/<devname> 409 - .............................................................................. 428 + 429 + ================ ======= 410 430 File Content 431 + ================ ======= 411 432 mb_groups details of multiblock allocator buddy cache of free blocks 412 - .............................................................................. 433 + ================ ======= 413 434 414 435 /sys entries 415 436 ============ ··· 422 439 /sys/fs/ext4/dm-0). The files in each per-device directory are shown 423 440 in table below. 424 441 425 - Files in /sys/fs/ext4/<devname> 426 - (see also Documentation/ABI/testing/sysfs-fs-ext4) 427 - .............................................................................. 428 - File Content 442 + Files in /sys/fs/ext4/<devname>: 429 443 444 + (see also Documentation/ABI/testing/sysfs-fs-ext4) 445 + 446 + ============================= ================================================= 447 + File Content 448 + ============================= ================================================= 430 449 delayed_allocation_blocks This file is read-only and shows the number of 431 450 blocks that are dirty in the page cache, but 432 451 which do not have their location in the 433 452 filesystem allocated yet. 434 453 435 - inode_goal Tuning parameter which (if non-zero) controls 454 + inode_goal Tuning parameter which (if non-zero) controls 436 455 the goal inode used by the inode allocator in 437 456 preference to all other allocation heuristics. 438 457 This is intended for debugging use only, and 439 458 should be 0 on production systems. 
440 459 441 - inode_readahead_blks Tuning parameter which controls the maximum 460 + inode_readahead_blks Tuning parameter which controls the maximum 442 461 number of inode table blocks that ext4's inode 443 462 table readahead algorithm will pre-read into 444 463 the buffer cache 445 464 446 - lifetime_write_kbytes This file is read-only and shows the number of 465 + lifetime_write_kbytes This file is read-only and shows the number of 447 466 kilobytes of data that have been written to this 448 467 filesystem since it was created. 449 468 ··· 493 508 in the file system. If there is not enough space 494 509 for the reserved space when mounting the file 495 510 mount will _not_ fail. 496 - .............................................................................. 511 + ============================= ================================================= 497 512 498 513 Ioctls 499 514 ====== ··· 503 518 shown in the table below. 504 519 505 520 Table of Ext4 specific ioctls 506 - .............................................................................. 507 - Ioctl Description 521 + 522 + ============================= ================================================= 523 + Ioctl Description 524 + ============================= ================================================= 508 525 EXT4_IOC_GETFLAGS Get additional attributes associated with inode. 509 526 The ioctl argument is an integer bitfield, with 510 527 bit values described in ext4.h. This ioctl is an ··· 597 610 normal user by accident. 598 611 The data blocks of the previous boot loader 599 612 will be associated with the given inode. 600 - 601 - .............................................................................. 613 + ============================= ================================================= 602 614 603 615 References 604 616 ==========

+17

Documentation/filesystems/ext4/index.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + =============== 4 + ext4 Filesystem 5 + =============== 6 + 7 + General usage and on-disk artifacts written by ext4. More documentation may 8 + be ported from the wiki as time permits. This should be considered the 9 + canonical source of information as the details here have been reviewed by 10 + the ext4 community. 11 + 12 + .. toctree:: 13 + :maxdepth: 5 14 + :numbered: 15 + 16 + ext4 17 + ondisk/index

+44

Documentation/filesystems/ext4/ondisk/about.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + About this Book 4 + =============== 5 + 6 + This document attempts to describe the on-disk format for ext4 7 + filesystems. The same general ideas should apply to ext2/3 filesystems 8 + as well, though they do not support all the features that ext4 supports, 9 + and the fields will be shorter. 10 + 11 + **NOTE**: This is a work in progress, based on notes that the author 12 + (djwong) made while picking apart a filesystem by hand. The data 13 + structure definitions should be current as of Linux 4.18 and 14 + e2fsprogs-1.44. All comments and corrections are welcome, since there is 15 + undoubtedly plenty of lore that might not be reflected in freshly 16 + created demonstration filesystems. 17 + 18 + License 19 + ------- 20 + This book is licensed under the terms of the GNU General Public License, v2. 21 + 22 + Terminology 23 + ----------- 24 + 25 + ext4 divides a storage device into an array of logical blocks both to 26 + reduce bookkeeping overhead and to increase throughput by forcing larger 27 + transfer sizes. Generally, the block size will be 4KiB (the same size as 28 + pages on x86 and the block layer's default block size), though the 29 + actual size is calculated as 2 ^ (10 + ``sb.s_log_block_size``) bytes. 30 + Throughout this document, disk locations are given in terms of these 31 + logical blocks, not raw LBAs, and not 1024-byte blocks. For the sake of 32 + convenience, the logical block size will be referred to as 33 + ``$block_size`` throughout the rest of the document. 34 + 35 + When referenced in ``preformatted text`` blocks, ``sb`` refers to fields 36 + in the super block, and ``inode`` refers to fields in an inode table 37 + entry. 38 + 39 + Other References 40 + ---------------- 41 + 42 + Also see http://www.nongnu.org/ext2-doc/ for quite a collection of 43 + information about ext2/3. Here's another old reference: 44 + http://wiki.osdev.org/Ext2

+56

Documentation/filesystems/ext4/ondisk/allocators.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Block and Inode Allocation Policy 4 + --------------------------------- 5 + 6 + ext4 recognizes (better than ext3, anyway) that data locality is 7 + generally a desirable quality of a filesystem. On a spinning disk, 8 + keeping related blocks near each other reduces the amount of movement 9 + that the head actuator and disk must perform to access a data block, 10 + thus speeding up disk IO. On an SSD there of course are no moving parts, 11 + but locality can increase the size of each transfer request while 12 + reducing the total number of requests. This locality may also have the 13 + effect of concentrating writes on a single erase block, which can speed 14 + up file rewrites significantly. Therefore, it is useful to reduce 15 + fragmentation whenever possible. 16 + 17 + The first tool that ext4 uses to combat fragmentation is the multi-block 18 + allocator. When a file is first created, the block allocator 19 + speculatively allocates 8KiB of disk space to the file on the assumption 20 + that the space will get written soon. When the file is closed, the 21 + unused speculative allocations are of course freed, but if the 22 + speculation is correct (typically the case for full writes of small 23 + files) then the file data gets written out in a single multi-block 24 + extent. A second related trick that ext4 uses is delayed allocation. 25 + Under this scheme, when a file needs more blocks to absorb file writes, 26 + the filesystem defers deciding the exact placement on the disk until all 27 + the dirty buffers are being written out to disk. By not committing to a 28 + particular placement until it's absolutely necessary (the commit timeout 29 + is hit, or sync() is called, or the kernel runs out of memory), the hope 30 + is that the filesystem can make better location decisions.
31 + 32 + The third trick that ext4 (and ext3) uses is that it tries to keep a 33 + file's data blocks in the same block group as its inode. This cuts down 34 + on the seek penalty when the filesystem first has to read a file's inode 35 + to learn where the file's data blocks live and then seek over to the 36 + file's data blocks to begin I/O operations. 37 + 38 + The fourth trick is that all the inodes in a directory are placed in the 39 + same block group as the directory, when feasible. The working assumption 40 + here is that all the files in a directory might be related, therefore it 41 + is useful to try to keep them all together. 42 + 43 + The fifth trick is that the disk volume is cut up into 128MB block 44 + groups; these mini-containers are used as outlined above to try to 45 + maintain data locality. However, there is a deliberate quirk -- when a 46 + directory is created in the root directory, the inode allocator scans 47 + the block groups and puts that directory into the least heavily loaded 48 + block group that it can find. This encourages directories to spread out 49 + over a disk; as the top-level directory/file blobs fill up one block 50 + group, the allocators simply move on to the next block group. Allegedly 51 + this scheme evens out the loading on the block groups, though the author 52 + suspects that the directories which are so unlucky as to land towards 53 + the end of a spinning drive get a raw deal performance-wise. 54 + 55 + Of course if all of these mechanisms fail, one can always use e4defrag 56 + to defragment files.

+191

Documentation/filesystems/ext4/ondisk/attributes.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Extended Attributes 4 + ------------------- 5 + 6 + Extended attributes (xattrs) are typically stored in a separate data 7 + block on the disk and referenced from inodes via ``inode.i_file_acl*``. 8 + The first use of extended attributes seems to have been for storing file 9 + ACLs and other security data (selinux). With the ``user_xattr`` mount 10 + option it is possible for users to store extended attributes so long as 11 + all attribute names begin with “user”; this restriction seems to have 12 + disappeared as of Linux 3.0. 13 + 14 + There are two places where extended attributes can be found. The first 15 + place is between the end of each inode entry and the beginning of the 16 + next inode entry. For example, if inode.i\_extra\_isize = 28 and 17 + sb.inode\_size = 256, then there are 256 - (128 + 28) = 100 bytes 18 + available for in-inode extended attribute storage. The second place 19 + where extended attributes can be found is in the block pointed to by 20 + ``inode.i_file_acl``. As of Linux 3.11, it is not possible for this 21 + block to contain a pointer to a second extended attribute block (or even 22 + the remaining blocks of a cluster). In theory it is possible for each 23 + attribute's value to be stored in a separate data block, though as of 24 + Linux 3.11 the code does not permit this. 25 + 26 + Keys are generally assumed to be ASCIIZ strings, whereas values can be 27 + strings or binary data. 28 + 29 + Extended attributes, when stored after the inode, have a header 30 + ``ext4_xattr_ibody_header`` that is 4 bytes long: 31 + 32 + .. list-table:: 33 + :widths: 1 1 1 77 34 + :header-rows: 1 35 + 36 + * - Offset 37 + - Type 38 + - Name 39 + - Description 40 + * - 0x0 41 + - \_\_le32 42 + - h\_magic 43 + - Magic number for identification, 0xEA020000. This value is set by the 44 + Linux driver, though e2fsprogs doesn't seem to check it(?) 
45 + 46 + The beginning of an extended attribute block is in 47 + ``struct ext4_xattr_header``, which is 32 bytes long: 48 + 49 + .. list-table:: 50 + :widths: 1 1 1 77 51 + :header-rows: 1 52 + 53 + * - Offset 54 + - Type 55 + - Name 56 + - Description 57 + * - 0x0 58 + - \_\_le32 59 + - h\_magic 60 + - Magic number for identification, 0xEA020000. 61 + * - 0x4 62 + - \_\_le32 63 + - h\_refcount 64 + - Reference count. 65 + * - 0x8 66 + - \_\_le32 67 + - h\_blocks 68 + - Number of disk blocks used. 69 + * - 0xC 70 + - \_\_le32 71 + - h\_hash 72 + - Hash value of all attributes. 73 + * - 0x10 74 + - \_\_le32 75 + - h\_checksum 76 + - Checksum of the extended attribute block. 77 + * - 0x14 78 + - \_\_u32 79 + - h\_reserved[2] 80 + - Zero. 81 + 82 + The checksum is calculated against the FS UUID, the 64-bit block number 83 + of the extended attribute block, and the entire block (header + 84 + entries). 85 + 86 + Following the ``struct ext4_xattr_header`` or 87 + ``struct ext4_xattr_ibody_header`` is an array of 88 + ``struct ext4_xattr_entry``; each of these entries is at least 16 bytes 89 + long. When stored in an external block, the ``struct ext4_xattr_entry`` 90 + entries must be stored in sorted order. The sort order is 91 + ``e_name_index``, then ``e_name_len``, and finally ``e_name``. 92 + Attributes stored inside an inode do not need to be stored in sorted order. 93 + 94 + .. list-table:: 95 + :widths: 1 1 1 77 96 + :header-rows: 1 97 + 98 + * - Offset 99 + - Type 100 + - Name 101 + - Description 102 + * - 0x0 103 + - \_\_u8 104 + - e\_name\_len 105 + - Length of name. 106 + * - 0x1 107 + - \_\_u8 108 + - e\_name\_index 109 + - Attribute name index. There is a discussion of this below. 110 + * - 0x2 111 + - \_\_le16 112 + - e\_value\_offs 113 + - Location of this attribute's value on the disk block where it is stored. 114 + Multiple attributes can share the same value.
For an inode attribute 115 + this value is relative to the start of the first entry; for a block this 116 + value is relative to the start of the block (i.e. the header). 117 + * - 0x4 118 + - \_\_le32 119 + - e\_value\_inum 120 + - The inode where the value is stored. Zero indicates the value is in the 121 + same block as this entry. This field is only used if the 122 + INCOMPAT\_EA\_INODE feature is enabled. 123 + * - 0x8 124 + - \_\_le32 125 + - e\_value\_size 126 + - Length of attribute value. 127 + * - 0xC 128 + - \_\_le32 129 + - e\_hash 130 + - Hash value of attribute name and attribute value. The kernel doesn't 131 + update the hash for in-inode attributes, so for that case this value 132 + must be zero, because e2fsck validates any non-zero hash regardless of 133 + where the xattr lives. 134 + * - 0x10 135 + - char 136 + - e\_name[e\_name\_len] 137 + - Attribute name. Does not include trailing NULL. 138 + 139 + Attribute values can follow the end of the entry table. There appears to 140 + be a requirement that they be aligned to 4-byte boundaries. The values 141 + are stored starting at the end of the block and grow towards the 142 + xattr\_header/xattr\_entry table. When the two collide, the overflow is 143 + put into a separate disk block. If the disk block fills up, the 144 + filesystem returns -ENOSPC. 145 + 146 + The first four fields of the ``ext4_xattr_entry`` are set to zero to 147 + mark the end of the key list. 148 + 149 + Attribute Name Indices 150 + ~~~~~~~~~~~~~~~~~~~~~~ 151 + 152 + Logically speaking, extended attributes are a series of key=value pairs. 153 + The keys are assumed to be NULL-terminated strings. To reduce the amount 154 + of on-disk space that the keys consume, the beginning of the key string 155 + is matched against the attribute name index. If a match is found, the 156 + attribute name index field is set, and matching string is removed from 157 + the key name. Here is a map of name index values to key prefixes: 158 + 159 + .. 
list-table:: 160 + :widths: 1 79 161 + :header-rows: 1 162 + 163 + * - Name Index 164 + - Key Prefix 165 + * - 0 166 + - (no prefix) 167 + * - 1 168 + - “user.” 169 + * - 2 170 + - “system.posix\_acl\_access” 171 + * - 3 172 + - “system.posix\_acl\_default” 173 + * - 4 174 + - “trusted.” 175 + * - 6 176 + - “security.” 177 + * - 7 178 + - “system.” (inline\_data only?) 179 + * - 8 180 + - “system.richacl” (SuSE kernels only?) 181 + 182 + For example, if the attribute key is “user.fubar”, the attribute name 183 + index is set to 1 and the “fubar” name is recorded on disk. 184 + 185 + POSIX ACLs 186 + ~~~~~~~~~~ 187 + 188 + POSIX ACLs are stored in a reduced version of the Linux kernel (and 189 + libacl's) internal ACL format. The key difference is that the version 190 + number is different (1) and the ``e_id`` field is only stored for named 191 + user and group ACLs.

+22

Documentation/filesystems/ext4/ondisk/bigalloc.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Bigalloc 4 + -------- 5 + 6 + At the moment, the default size of a block is 4KiB, which is a commonly 7 + supported page size on most MMU-capable hardware. This is fortunate, as 8 + ext4 code is not prepared to handle the case where the block size 9 + exceeds the page size. However, for a filesystem of mostly huge files, 10 + it is desirable to be able to allocate disk blocks in units of multiple 11 + blocks to reduce both fragmentation and metadata overhead. The 12 + `bigalloc <Bigalloc>`__ feature provides exactly this ability. The 13 + administrator can set a block cluster size at mkfs time (which is stored 14 + in the s\_log\_cluster\_size field in the superblock); from then on, the 15 + block bitmaps track clusters, not individual blocks. This means that 16 + block groups can be several gigabytes in size (instead of just 128MiB); 17 + however, the minimum allocation unit becomes a cluster, not a block, 18 + even for directories. TaoBao had a patchset to extend the “use units of 19 + clusters instead of blocks” to the extent tree, though it is not clear 20 + where those patches went-- they eventually morphed into “extent tree v2” 21 + but that code has not landed as of May 2015. 22 +

+28

Documentation/filesystems/ext4/ondisk/bitmaps.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Block and inode Bitmaps 4 + ----------------------- 5 + 6 + The data block bitmap tracks the usage of data blocks within the block 7 + group. 8 + 9 + The inode bitmap records which entries in the inode table are in use. 10 + 11 + As with most bitmaps, one bit represents the usage status of one data 12 + block or inode table entry. This implies a block group size of 8 \* 13 + number\_of\_bytes\_in\_a\_logical\_block. 14 + 15 + NOTE: If ``BLOCK_UNINIT`` is set for a given block group, various parts 16 + of the kernel and e2fsprogs code pretend that the block bitmap contains 17 + zeros (i.e. all blocks in the group are free). However, it is not 18 + necessarily the case that no blocks are in use -- if ``meta_bg`` is set, 19 + the bitmaps and group descriptor live inside the group. Unfortunately, 20 + ext2fs\_test\_block\_bitmap2() will return '0' for those locations, 21 + which produces confusing debugfs output. 22 + 23 + Inode Table 24 + ----------- 25 + Inode tables are statically allocated at mkfs time. Each block group 26 + descriptor points to the start of the table, and the superblock records 27 + the number of inodes per group. See the section on inodes for more 28 + information.

+135

Documentation/filesystems/ext4/ondisk/blockgroup.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Layout 4 + ------ 5 + 6 + The layout of a standard block group is approximately as follows (each 7 + of these fields is discussed in a separate section below): 8 + 9 + .. list-table:: 10 + :widths: 1 1 1 1 1 1 1 1 11 + :header-rows: 1 12 + 13 + * - Group 0 Padding 14 + - ext4 Super Block 15 + - Group Descriptors 16 + - Reserved GDT Blocks 17 + - Data Block Bitmap 18 + - inode Bitmap 19 + - inode Table 20 + - Data Blocks 21 + * - 1024 bytes 22 + - 1 block 23 + - many blocks 24 + - many blocks 25 + - 1 block 26 + - 1 block 27 + - many blocks 28 + - many more blocks 29 + 30 + For the special case of block group 0, the first 1024 bytes are unused, 31 + to allow for the installation of x86 boot sectors and other oddities. 32 + The superblock will start at offset 1024 bytes, whichever block that 33 + happens to be (usually 0). However, if for some reason the block size = 34 + 1024, then block 0 is marked in use and the superblock goes in block 1. 35 + For all other block groups, there is no padding. 36 + 37 + The ext4 driver primarily works with the superblock and the group 38 + descriptors that are found in block group 0. Redundant copies of the 39 + superblock and group descriptors are written to some of the block groups 40 + across the disk in case the beginning of the disk gets trashed, though 41 + not all block groups necessarily host a redundant copy (see following 42 + paragraph for more details). If the group does not have a redundant 43 + copy, the block group begins with the data block bitmap. Note also that 44 + when the filesystem is freshly formatted, mkfs will allocate “reserve 45 + GDT block” space after the block group descriptors and before the start 46 + of the block bitmaps to allow for future expansion of the filesystem. By 47 + default, a filesystem is allowed to increase in size by a factor of 48 + 1024x over the original filesystem size. 
49 + 50 + The location of the inode table is given by ``grp.bg_inode_table_*``. It 51 + is a continuous range of blocks large enough to contain 52 + ``sb.s_inodes_per_group * sb.s_inode_size`` bytes. 53 + 54 + As for the ordering of items in a block group, it is generally 55 + established that the super block and the group descriptor table, if 56 + present, will be at the beginning of the block group. The bitmaps and 57 + the inode table can be anywhere, and it is quite possible for the 58 + bitmaps to come after the inode table, or for both to be in different 59 + groups (flex\_bg). Leftover space is used for file data blocks, indirect 60 + block maps, extent tree blocks, and extended attributes. 61 + 62 + Flexible Block Groups 63 + --------------------- 64 + 65 + Starting in ext4, there is a new feature called flexible block groups 66 + (flex\_bg). In a flex\_bg, several block groups are tied together as one 67 + logical block group; the bitmap spaces and the inode table space in the 68 + first block group of the flex\_bg are expanded to include the bitmaps 69 + and inode tables of all other block groups in the flex\_bg. For example, 70 + if the flex\_bg size is 4, then group 0 will contain (in order) the 71 + superblock, group descriptors, data block bitmaps for groups 0-3, inode 72 + bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining 73 + space in group 0 is for file data. The effect of this is to group the 74 + block metadata close together for faster loading, and to enable large 75 + files to be continuous on disk. Backup copies of the superblock and 76 + group descriptors are always at the beginning of block groups, even if 77 + flex\_bg is enabled. The number of block groups that make up a flex\_bg 78 + is given by 2 ^ ``sb.s_log_groups_per_flex``. 79 + 80 + Meta Block Groups 81 + ----------------- 82 + 83 + Without the option META\_BG, for safety concerns, all block group 84 + descriptor copies are kept in the first block group.
Given the default 85 + 128MiB (2^27 bytes) block group size and 64-byte group descriptors, ext4 86 + can have at most 2^27/64 = 2^21 block groups. This limits the entire 87 + filesystem size to 2^21 ∗ 2^27 = 2^48 bytes or 256TiB. 88 + 89 + The solution to this problem is to use the metablock group feature 90 + (META\_BG), which is already in ext3 for all 2.6 releases. With the 91 + META\_BG feature, ext4 filesystems are partitioned into many metablock 92 + groups. Each metablock group is a cluster of block groups whose group 93 + descriptor structures can be stored in a single disk block. For ext4 94 + filesystems with 4 KB block size, a single metablock group partition 95 + includes 64 block groups, or 8 GiB of disk space. The metablock group 96 + feature moves the location of the group descriptors from the congested 97 + first block group of the whole filesystem into the first group of each 98 + metablock group itself. The backups are in the second and last group of 99 + each metablock group. This increases the 2^21 maximum block groups limit 100 + to the hard limit 2^32, allowing support for a 512PiB filesystem. 101 + 102 + The change in the filesystem format replaces the current scheme where 103 + the superblock is followed by a variable-length set of block group 104 + descriptors. Instead, the superblock and a single block group descriptor 105 + block are placed at the beginning of the first, second, and last block 106 + groups in a meta-block group. A meta-block group is a collection of 107 + block groups which can be described by a single block group descriptor 108 + block. Since the size of the block group descriptor structure is 32 109 + bytes, a meta-block group contains 32 block groups for filesystems with 110 + a 1KB block size, and 128 block groups for filesystems with a 4KB 111 + blocksize.
Filesystems can either be created using this new block group 112 + descriptor layout, or existing filesystems can be resized on-line, and 113 + the field s\_first\_meta\_bg in the superblock will indicate the first 114 + block group using this new layout. 115 + 116 + Please see an important note about ``BLOCK_UNINIT`` in the section about 117 + block and inode bitmaps. 118 + 119 + Lazy Block Group Initialization 120 + ------------------------------- 121 + 122 + A new feature for ext4 is a set of three block group descriptor flags that 123 + enable mkfs to skip initializing other parts of the block group 124 + metadata. Specifically, the INODE\_UNINIT and BLOCK\_UNINIT flags mean 125 + that the inode and block bitmaps for that group can be calculated and 126 + therefore the on-disk bitmap blocks are not initialized. This is 127 + generally the case for an empty block group or a block group containing 128 + only fixed-location block group metadata. The INODE\_ZEROED flag means 129 + that the inode table has been initialized; mkfs will unset this flag and 130 + rely on the kernel to initialize the inode tables in the background. 131 + 132 + By not writing zeroes to the bitmaps and inode table, mkfs time is 133 + reduced considerably. Note the feature flag is RO\_COMPAT\_GDT\_CSUM, 134 + but the dumpe2fs output prints this as “uninit\_bg”. They are the same 135 + thing.

+49

Documentation/filesystems/ext4/ondisk/blockmap.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 4 + | i.i\_block Offset | Where It Points | 5 + +=====================+==============================================================================================================================================================================================================================+ 6 + | 0 to 11 | Direct map to file blocks 0 to 11. | 7 + +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 8 + | 12 | Indirect block: (file blocks 12 to (``$block_size`` / 4) + 11, or 12 to 1035 if 4KiB blocks) | 9 + | | | 10 + | | +------------------------------+--------------------------------------------------------------------+ | 11 + | | | Indirect Block Offset | Where It Points | | 12 + | | +==============================+====================================================================+ | 13 + | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | 14 + | | +------------------------------+--------------------------------------------------------------------+ | 15 + +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 16 + | 13 | Double-indirect block: (file blocks ``$block_size``/4 + 12 to (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 11, or 1036 to 1049611 if 4KiB blocks) | 17 + | | | 18 + | | 
+--------------------------------+---------------------------------------------------------------------------------------------------------+ | 19 + | | | Double Indirect Block Offset | Where It Points | | 20 + | | +================================+=========================================================================================================+ | 21 + | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | 22 + | | | | | | 23 + | | | | +------------------------------+--------------------------------------------------------------------+ | | 24 + | | | | | Indirect Block Offset | Where It Points | | | 25 + | | | | +==============================+====================================================================+ | | 26 + | | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | 27 + | | | | +------------------------------+--------------------------------------------------------------------+ | | 28 + | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | 29 + +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ 30 + | 14 | Triple-indirect block: (file blocks (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12 to (``$block_size`` / 4) ^ 3 + (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12, or 1049612 to 1074791436 if 4KiB blocks) | 31 + | | | 32 + | | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | 33 + | | | Triple Indirect Block Offset | Where It Points | | 34 + | | 
+================================+================================================================================================================================================+ | 35 + | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) double indirect blocks (1024 if 4KiB blocks) | | 36 + | | | | | | 37 + | | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | 38 + | | | | | Double Indirect Block Offset | Where It Points | | | 39 + | | | | +================================+=========================================================================================================+ | | 40 + | | | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | | 41 + | | | | | | | | | 42 + | | | | | | +------------------------------+--------------------------------------------------------------------+ | | | 43 + | | | | | | | Indirect Block Offset | Where It Points | | | | 44 + | | | | | | +==============================+====================================================================+ | | | 45 + | | | | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | | 46 + | | | | | | +------------------------------+--------------------------------------------------------------------+ | | | 47 + | | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | 48 + | | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | 49 + 
+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

+142

Documentation/filesystems/ext4/ondisk/blocks.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Blocks 4 + ------ 5 + 6 + ext4 allocates storage space in units of “blocks”. A block is a group of 7 + sectors between 1KiB and 64KiB, and the number of sectors must be an 8 + integral power of 2. Blocks are in turn grouped into larger units called 9 + block groups. Block size is specified at mkfs time and typically is 10 + 4KiB. You may experience mounting problems if block size is greater than 11 + page size (i.e. 64KiB blocks on an i386 which only has 4KiB memory 12 + pages). By default a filesystem can contain 2^32 blocks; if the '64bit' 13 + feature is enabled, then a filesystem can have 2^64 blocks. 14 + 15 + For 32-bit filesystems, limits are as follows: 16 + 17 + .. list-table:: 18 + :widths: 1 1 1 1 1 19 + :header-rows: 1 20 + 21 + * - Item 22 + - 1KiB 23 + - 2KiB 24 + - 4KiB 25 + - 64KiB 26 + * - Blocks 27 + - 2^32 28 + - 2^32 29 + - 2^32 30 + - 2^32 31 + * - Inodes 32 + - 2^32 33 + - 2^32 34 + - 2^32 35 + - 2^32 36 + * - File System Size 37 + - 4TiB 38 + - 8TiB 39 + - 16TiB 40 + - 256TiB 41 + * - Blocks Per Block Group 42 + - 8,192 43 + - 16,384 44 + - 32,768 45 + - 524,288 46 + * - Inodes Per Block Group 47 + - 8,192 48 + - 16,384 49 + - 32,768 50 + - 524,288 51 + * - Block Group Size 52 + - 8MiB 53 + - 32MiB 54 + - 128MiB 55 + - 32GiB 56 + * - Blocks Per File, Extents 57 + - 2^32 58 + - 2^32 59 + - 2^32 60 + - 2^32 61 + * - Blocks Per File, Block Maps 62 + - 16,843,020 63 + - 134,480,396 64 + - 1,074,791,436 65 + - 4,398,314,962,956 (really 2^32 due to field size limitations) 66 + * - File Size, Extents 67 + - 4TiB 68 + - 8TiB 69 + - 16TiB 70 + - 256TiB 71 + * - File Size, Block Maps 72 + - 16GiB 73 + - 256GiB 74 + - 4TiB 75 + - 256TiB 76 + 77 + For 64-bit filesystems, limits are as follows: 78 + 79 + ..
list-table:: 80 + :widths: 1 1 1 1 1 81 + :header-rows: 1 82 + 83 + * - Item 84 + - 1KiB 85 + - 2KiB 86 + - 4KiB 87 + - 64KiB 88 + * - Blocks 89 + - 2^64 90 + - 2^64 91 + - 2^64 92 + - 2^64 93 + * - Inodes 94 + - 2^32 95 + - 2^32 96 + - 2^32 97 + - 2^32 98 + * - File System Size 99 + - 16ZiB 100 + - 32ZiB 101 + - 64ZiB 102 + - 1YiB 103 + * - Blocks Per Block Group 104 + - 8,192 105 + - 16,384 106 + - 32,768 107 + - 524,288 108 + * - Inodes Per Block Group 109 + - 8,192 110 + - 16,384 111 + - 32,768 112 + - 524,288 113 + * - Block Group Size 114 + - 8MiB 115 + - 32MiB 116 + - 128MiB 117 + - 32GiB 118 + * - Blocks Per File, Extents 119 + - 2^32 120 + - 2^32 121 + - 2^32 122 + - 2^32 123 + * - Blocks Per File, Block Maps 124 + - 16,843,020 125 + - 134,480,396 126 + - 1,074,791,436 127 + - 4,398,314,962,956 (really 2^32 due to field size limitations) 128 + * - File Size, Extents 129 + - 4TiB 130 + - 8TiB 131 + - 16TiB 132 + - 256TiB 133 + * - File Size, Block Maps 134 + - 16GiB 135 + - 256GiB 136 + - 4TiB 137 + - 256TiB 138 + 139 + Note: Files not using extents (i.e. files using block maps) must be 140 + placed within the first 2^32 blocks of a filesystem. Files with extents 141 + must be placed within the first 2^48 blocks of a filesystem. It's not 142 + clear what happens with larger filesystems.

+73

Documentation/filesystems/ext4/ondisk/checksums.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Checksums 4 + --------- 5 + 6 + Starting in early 2012, metadata checksums were added to all major ext4 7 + and jbd2 data structures. The associated feature flag is metadata\_csum. 8 + The desired checksum algorithm is indicated in the superblock, though as 9 + of October 2012 the only supported algorithm is crc32c. Some data 10 + structures did not have space to fit a full 32-bit checksum, so only the 11 + lower 16 bits are stored. Enabling the 64bit feature increases the data 12 + structure size so that full 32-bit checksums can be stored for many data 13 + structures. However, existing 32-bit filesystems cannot be extended to 14 + enable 64bit mode, at least not without the experimental resize2fs 15 + patches to do so. 16 + 17 + Existing filesystems can have checksumming added by running 18 + ``tune2fs -O metadata_csum`` against the underlying device. If tune2fs 19 + encounters directory blocks that lack sufficient empty space to add a 20 + checksum, it will request that you run ``e2fsck -D`` to have the 21 + directories rebuilt with checksums. This has the added benefit of 22 + removing slack space from the directory files and rebalancing the htree 23 + indexes. If you \_ignore\_ this step, your directories will not be 24 + protected by a checksum! 25 + 26 + The following table describes the data elements that go into each type 27 + of checksum. The checksum function is whatever the superblock describes 28 + (crc32c as of October 2013) unless noted otherwise. 29 + 30 + .. list-table:: 31 + :widths: 1 1 4 32 + :header-rows: 1 33 + 34 + * - Metadata 35 + - Length 36 + - Ingredients 37 + * - Superblock 38 + - \_\_le32 39 + - The entire superblock up to the checksum field. The UUID lives inside 40 + the superblock. 41 + * - MMP 42 + - \_\_le32 43 + - UUID + the entire MMP block up to the checksum field. 44 + * - Extended Attributes 45 + - \_\_le32 46 + - UUID + the entire extended attribute block. 
The checksum field is set to 47 + zero. 48 + * - Directory Entries 49 + - \_\_le32 50 + - UUID + inode number + inode generation + the directory block up to the 51 + fake entry enclosing the checksum field. 52 + * - HTREE Nodes 53 + - \_\_le32 54 + - UUID + inode number + inode generation + all valid extents + HTREE tail. 55 + The checksum field is set to zero. 56 + * - Extents 57 + - \_\_le32 58 + - UUID + inode number + inode generation + the entire extent block up to 59 + the checksum field. 60 + * - Bitmaps 61 + - \_\_le32 or \_\_le16 62 + - UUID + the entire bitmap. Checksums are stored in the group descriptor, 63 + and truncated if the group descriptor size is 32 bytes (i.e. ^64bit) 64 + * - Inodes 65 + - \_\_le32 66 + - UUID + inode number + inode generation + the entire inode. The checksum 67 + field is set to zero. Each inode has its own checksum. 68 + * - Group Descriptors 69 + - \_\_le16 70 + - If metadata\_csum, then UUID + group number + the entire descriptor; 71 + else if gdt\_csum, then crc16(UUID + group number + the entire 72 + descriptor). In all cases, only the lower 16 bits are stored. 73 +

+426

Documentation/filesystems/ext4/ondisk/directory.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Directory Entries 4 + ----------------- 5 + 6 + In an ext4 filesystem, a directory is more or less a flat file that maps 7 + an arbitrary byte string (usually ASCII) to an inode number on the 8 + filesystem. There can be many directory entries across the filesystem 9 + that reference the same inode number--these are known as hard links, and 10 + that is why hard links cannot reference files on other filesystems. As 11 + such, directory entries are found by reading the data block(s) 12 + associated with a directory file for the particular directory entry that 13 + is desired. 14 + 15 + Linear (Classic) Directories 16 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 17 + 18 + By default, each directory lists its entries in an “almost-linear” 19 + array. I write “almost” because it's not a linear array in the memory 20 + sense because directory entries are not split across filesystem blocks. 21 + Therefore, it is more accurate to say that a directory is a series of 22 + data blocks and that each block contains a linear array of directory 23 + entries. The end of each per-block array is signified by reaching the 24 + end of the block; the last entry in the block has a record length that 25 + takes it all the way to the end of the block. The end of the entire 26 + directory is of course signified by reaching the end of the file. Unused 27 + directory entries are signified by inode = 0. By default the filesystem 28 + uses ``struct ext4_dir_entry_2`` for directory entries unless the 29 + “filetype” feature flag is not set, in which case it uses 30 + ``struct ext4_dir_entry``. 31 + 32 + The original directory entry format is ``struct ext4_dir_entry``, which 33 + is at most 263 bytes long, though on disk you'll need to reference 34 + ``dirent.rec_len`` to know for sure. 35 + 36 + .. 
list-table:: 37 + :widths: 1 1 1 77 38 + :header-rows: 1 39 + 40 + * - Offset 41 + - Size 42 + - Name 43 + - Description 44 + * - 0x0 45 + - \_\_le32 46 + - inode 47 + - Number of the inode that this directory entry points to. 48 + * - 0x4 49 + - \_\_le16 50 + - rec\_len 51 + - Length of this directory entry. Must be a multiple of 4. 52 + * - 0x6 53 + - \_\_le16 54 + - name\_len 55 + - Length of the file name. 56 + * - 0x8 57 + - char 58 + - name[EXT4\_NAME\_LEN] 59 + - File name. 60 + 61 + Since file names cannot be longer than 255 bytes, the new directory 62 + entry format shortens the rec\_len field and uses the space for a file 63 + type flag, probably to avoid having to load every inode during directory 64 + tree traversal. This format is ``ext4_dir_entry_2``, which is at most 65 + 263 bytes long, though on disk you'll need to reference 66 + ``dirent.rec_len`` to know for sure. 67 + 68 + .. list-table:: 69 + :widths: 1 1 1 77 70 + :header-rows: 1 71 + 72 + * - Offset 73 + - Size 74 + - Name 75 + - Description 76 + * - 0x0 77 + - \_\_le32 78 + - inode 79 + - Number of the inode that this directory entry points to. 80 + * - 0x4 81 + - \_\_le16 82 + - rec\_len 83 + - Length of this directory entry. 84 + * - 0x6 85 + - \_\_u8 86 + - name\_len 87 + - Length of the file name. 88 + * - 0x7 89 + - \_\_u8 90 + - file\_type 91 + - File type code, see ftype_ table below. 92 + * - 0x8 93 + - char 94 + - name[EXT4\_NAME\_LEN] 95 + - File name. 96 + 97 + .. _ftype: 98 + 99 + The directory file type is one of the following values: 100 + 101 + .. list-table:: 102 + :widths: 1 79 103 + :header-rows: 1 104 + 105 + * - Value 106 + - Description 107 + * - 0x0 108 + - Unknown. 109 + * - 0x1 110 + - Regular file. 111 + * - 0x2 112 + - Directory. 113 + * - 0x3 114 + - Character device file. 115 + * - 0x4 116 + - Block device file. 117 + * - 0x5 118 + - FIFO. 119 + * - 0x6 120 + - Socket. 121 + * - 0x7 122 + - Symbolic link. 
123 + 124 + In order to add checksums to these classic directory blocks, a phony 125 + ``struct ext4_dir_entry`` is placed at the end of each leaf block to 126 + hold the checksum. The directory entry is 12 bytes long. The inode 127 + number and name\_len fields are set to zero to fool old software into 128 + ignoring an apparently empty directory entry, and the checksum is stored 129 + in the place where the name normally goes. The structure is 130 + ``struct ext4_dir_entry_tail``: 131 + 132 + .. list-table:: 133 + :widths: 1 1 1 77 134 + :header-rows: 1 135 + 136 + * - Offset 137 + - Size 138 + - Name 139 + - Description 140 + * - 0x0 141 + - \_\_le32 142 + - det\_reserved\_zero1 143 + - Inode number, which must be zero. 144 + * - 0x4 145 + - \_\_le16 146 + - det\_rec\_len 147 + - Length of this directory entry, which must be 12. 148 + * - 0x6 149 + - \_\_u8 150 + - det\_reserved\_zero2 151 + - Length of the file name, which must be zero. 152 + * - 0x7 153 + - \_\_u8 154 + - det\_reserved\_ft 155 + - File type, which must be 0xDE. 156 + * - 0x8 157 + - \_\_le32 158 + - det\_checksum 159 + - Directory leaf block checksum. 160 + 161 + The leaf directory block checksum is calculated against the FS UUID, the 162 + directory's inode number, the directory's inode generation number, and 163 + the entire directory entry block up to (but not including) the fake 164 + directory entry. 165 + 166 + Hash Tree Directories 167 + ~~~~~~~~~~~~~~~~~~~~~ 168 + 169 + A linear array of directory entries isn't great for performance, so a 170 + new feature was added to ext3 to provide a faster (but peculiar) 171 + balanced tree keyed off a hash of the directory entry name. If the 172 + EXT4\_INDEX\_FL (0x1000) flag is set in the inode, this directory uses a 173 + hashed btree (htree) to organize and find directory entries. 
For 174 + backwards read-only compatibility with ext2, this tree is actually 175 + hidden inside the directory file, masquerading as “empty” directory data 176 + blocks! It was stated previously that the end of the linear directory 177 + entry table was signified with an entry pointing to inode 0; this is 178 + (ab)used to fool the old linear-scan algorithm into thinking that the 179 + rest of the directory block is empty so that it moves on. 180 + 181 + The root of the tree always lives in the first data block of the 182 + directory. By ext2 custom, the '.' and '..' entries must appear at the 183 + beginning of this first block, so they are put here as two 184 + ``struct ext4_dir_entry_2``\ s and not stored in the tree. The rest of 185 + the root node contains metadata about the tree and finally a hash->block 186 + map to find nodes that are lower in the htree. If 187 + ``dx_root.info.indirect_levels`` is non-zero then the htree has two 188 + levels; the data block pointed to by the root node's map is an interior 189 + node, which is indexed by a minor hash. Interior nodes in this tree 190 + contain a zeroed out ``struct ext4_dir_entry_2`` followed by a 191 + minor\_hash->block map to find leaf nodes. Leaf nodes contain a linear 192 + array of all ``struct ext4_dir_entry_2``; all of these entries 193 + (presumably) hash to the same value. If there is an overflow, the 194 + entries simply overflow into the next leaf node, and the 195 + least-significant bit of the hash (in the interior node map) that gets 196 + us to this next leaf node is set. 197 + 198 + To traverse the directory as a htree, the code calculates the hash of 199 + the desired file name and uses it to find the corresponding block 200 + number. If the tree is flat, the block is a linear array of directory 201 + entries that can be searched; otherwise, the minor hash of the file name 202 + is computed and used against this second block to find the corresponding 203 + third block number.
That third block number will be a linear array of 204 + directory entries. 205 + 206 + To traverse the directory as a linear array (such as the old code does), 207 + the code simply reads every data block in the directory. The blocks used 208 + for the htree will appear to have no entries (aside from '.' and '..') 209 + and so only the leaf nodes will appear to have any interesting content. 210 + 211 + The root of the htree is in ``struct dx_root``, which is the full length 212 + of a data block: 213 + 214 + .. list-table:: 215 + :widths: 1 1 1 77 216 + :header-rows: 1 217 + 218 + * - Offset 219 + - Type 220 + - Name 221 + - Description 222 + * - 0x0 223 + - \_\_le32 224 + - dot.inode 225 + - inode number of this directory. 226 + * - 0x4 227 + - \_\_le16 228 + - dot.rec\_len 229 + - Length of this record, 12. 230 + * - 0x6 231 + - u8 232 + - dot.name\_len 233 + - Length of the name, 1. 234 + * - 0x7 235 + - u8 236 + - dot.file\_type 237 + - File type of this entry, 0x2 (directory) (if the feature flag is set). 238 + * - 0x8 239 + - char 240 + - dot.name[4] 241 + - “.\\0\\0\\0” 242 + * - 0xC 243 + - \_\_le32 244 + - dotdot.inode 245 + - inode number of parent directory. 246 + * - 0x10 247 + - \_\_le16 248 + - dotdot.rec\_len 249 + - block\_size - 12. The record length is long enough to cover all htree 250 + data. 251 + * - 0x12 252 + - u8 253 + - dotdot.name\_len 254 + - Length of the name, 2. 255 + * - 0x13 256 + - u8 257 + - dotdot.file\_type 258 + - File type of this entry, 0x2 (directory) (if the feature flag is set). 259 + * - 0x14 260 + - char 261 + - dotdot\_name[4] 262 + - “..\\0\\0” 263 + * - 0x18 264 + - \_\_le32 265 + - struct dx\_root\_info.reserved\_zero 266 + - Zero. 267 + * - 0x1C 268 + - u8 269 + - struct dx\_root\_info.hash\_version 270 + - Hash type, see dirhash_ table below. 271 + * - 0x1D 272 + - u8 273 + - struct dx\_root\_info.info\_length 274 + - Length of the tree information, 0x8. 
275 + * - 0x1E 276 + - u8 277 + - struct dx\_root\_info.indirect\_levels 278 + - Depth of the htree. Cannot be larger than 3 if the INCOMPAT\_LARGEDIR 279 + feature is set; cannot be larger than 2 otherwise. 280 + * - 0x1F 281 + - u8 282 + - struct dx\_root\_info.unused\_flags 283 + - 284 + * - 0x20 285 + - \_\_le16 286 + - limit 287 + - Maximum number of dx\_entries that can follow this header, plus 1 for 288 + the header itself. 289 + * - 0x22 290 + - \_\_le16 291 + - count 292 + - Actual number of dx\_entries that follow this header, plus 1 for the 293 + header itself. 294 + * - 0x24 295 + - \_\_le32 296 + - block 297 + - The block number (within the directory file) that goes with hash=0. 298 + * - 0x28 299 + - struct dx\_entry 300 + - entries[0] 301 + - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. 302 + 303 + .. _dirhash: 304 + 305 + The directory hash is one of the following values: 306 + 307 + .. list-table:: 308 + :widths: 1 79 309 + :header-rows: 1 310 + 311 + * - Value 312 + - Description 313 + * - 0x0 314 + - Legacy. 315 + * - 0x1 316 + - Half MD4. 317 + * - 0x2 318 + - Tea. 319 + * - 0x3 320 + - Legacy, unsigned. 321 + * - 0x4 322 + - Half MD4, unsigned. 323 + * - 0x5 324 + - Tea, unsigned. 325 + 326 + Interior nodes of an htree are recorded as ``struct dx_node``, which is 327 + also the full length of a data block: 328 + 329 + .. list-table:: 330 + :widths: 1 1 1 77 331 + :header-rows: 1 332 + 333 + * - Offset 334 + - Type 335 + - Name 336 + - Description 337 + * - 0x0 338 + - \_\_le32 339 + - fake.inode 340 + - Zero, to make it look like this entry is not in use. 341 + * - 0x4 342 + - \_\_le16 343 + - fake.rec\_len 344 + - The size of the block, in order to hide all of the dx\_node data. 345 + * - 0x6 346 + - u8 347 + - name\_len 348 + - Zero. There is no name for this “unused” directory entry. 349 + * - 0x7 350 + - u8 351 + - file\_type 352 + - Zero. There is no file type for this “unused” directory entry. 
353 + * - 0x8 354 + - \_\_le16 355 + - limit 356 + - Maximum number of dx\_entries that can follow this header, plus 1 for 357 + the header itself. 358 + * - 0xA 359 + - \_\_le16 360 + - count 361 + - Actual number of dx\_entries that follow this header, plus 1 for the 362 + header itself. 363 + * - 0xC 364 + - \_\_le32 365 + - block 366 + - The block number (within the directory file) that goes with the lowest 367 + hash value of this block. This value is stored in the parent block. 368 + * - 0x10 369 + - struct dx\_entry 370 + - entries[0] 371 + - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. 372 + 373 + The hash maps that exist in both ``struct dx_root`` and 374 + ``struct dx_node`` are recorded as ``struct dx_entry``, which is 8 bytes 375 + long: 376 + 377 + .. list-table:: 378 + :widths: 1 1 1 77 379 + :header-rows: 1 380 + 381 + * - Offset 382 + - Type 383 + - Name 384 + - Description 385 + * - 0x0 386 + - \_\_le32 387 + - hash 388 + - Hash code. 389 + * - 0x4 390 + - \_\_le32 391 + - block 392 + - Block number (within the directory file, not filesystem blocks) of the 393 + next node in the htree. 394 + 395 + (If you think this is all quite clever and peculiar, so does the 396 + author.) 397 + 398 + If metadata checksums are enabled, the last 8 bytes of the directory 399 + block (precisely the length of one dx\_entry) are used to store a 400 + ``struct dx_tail``, which contains the checksum. The ``limit`` and 401 + ``count`` entries in the dx\_root/dx\_node structures are adjusted as 402 + necessary to fit the dx\_tail into the block. If there is no space for 403 + the dx\_tail, the user is notified to run e2fsck -D to rebuild the 404 + directory index (which will ensure that there's space for the checksum). 405 + The dx\_tail structure is 8 bytes long and looks like this: 406 + 407 + ..
list-table:: 408 + :widths: 1 1 1 77 409 + :header-rows: 1 410 + 411 + * - Offset 412 + - Type 413 + - Name 414 + - Description 415 + * - 0x0 416 + - u32 417 + - dt\_reserved 418 + - Zero. 419 + * - 0x4 420 + - \_\_le32 421 + - dt\_checksum 422 + - Checksum of the htree directory block. 423 + 424 + The checksum is calculated against the FS UUID, the htree index header 425 + (dx\_root or dx\_node), all of the htree indices (dx\_entry) that are in 426 + use, and the tail block (dx\_tail).

+12

Documentation/filesystems/ext4/ondisk/dynamic.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Dynamic Structures 4 + ================== 5 + 6 + Dynamic metadata are created on the fly when files and blocks are 7 + allocated to files. 8 + 9 + .. include:: inodes.rst 10 + .. include:: ifork.rst 11 + .. include:: directory.rst 12 + .. include:: attributes.rst

+18

Documentation/filesystems/ext4/ondisk/eainode.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Large Extended Attribute Values 4 + ------------------------------- 5 + 6 + To enable ext4 to store extended attribute values that do not fit in the 7 + inode or in the single extended attribute block attached to an inode, 8 + the EA\_INODE feature allows us to store the value in the data blocks of 9 + a regular file inode. This “EA inode” is linked only from the extended 10 + attribute name index and must not appear in a directory entry. The 11 + inode's i\_atime field is used to store a checksum of the xattr value; 12 + and i\_ctime/i\_version store a 64-bit reference count, which enables 13 + sharing of large xattr values between multiple owning inodes. For 14 + backward compatibility with older versions of this feature, the 15 + i\_mtime/i\_generation *may* store a back-reference to the inode number 16 + and i\_generation of the **one** owning inode (in cases where the EA 17 + inode is not referenced by multiple inodes) to verify that the EA inode 18 + is the correct one being accessed.

+13

Documentation/filesystems/ext4/ondisk/globals.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Global Structures 4 + ================= 5 + 6 + The filesystem is sharded into a number of block groups, each of which 7 + has static metadata at fixed locations. 8 + 9 + .. include:: super.rst 10 + .. include:: group_descr.rst 11 + .. include:: bitmaps.rst 12 + .. include:: mmp.rst 13 + .. include:: journal.rst

+170

Documentation/filesystems/ext4/ondisk/group_descr.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Block Group Descriptors 4 + ----------------------- 5 + 6 + Each block group on the filesystem has one of these descriptors 7 + associated with it. As noted in the Layout section above, the group 8 + descriptors (if present) are the second item in the block group. The 9 + standard configuration is for each block group to contain a full copy of 10 + the block group descriptor table unless the sparse\_super feature flag 11 + is set. 12 + 13 + Notice how the group descriptor records the location of both bitmaps and 14 + the inode table (i.e. they can float). This means that within a block 15 + group, the only data structures with fixed locations are the superblock 16 + and the group descriptor table. The flex\_bg mechanism uses this 17 + property to group several block groups into a flex group and lay out all 18 + of the groups' bitmaps and inode tables into one long run in the first 19 + group of the flex group. 20 + 21 + If the meta\_bg feature flag is set, then several block groups are 22 + grouped together into a meta group. Note that in the meta\_bg case, 23 + however, the first and last two block groups within the larger meta 24 + group contain only group descriptors for the groups inside the meta 25 + group. 26 + 27 + flex\_bg and meta\_bg do not appear to be mutually exclusive features. 28 + 29 + In ext2, ext3, and ext4 (when the 64bit feature is not enabled), the 30 + block group descriptor was only 32 bytes long and therefore ends at 31 + bg\_checksum. On an ext4 filesystem with the 64bit feature enabled, the 32 + block group descriptor expands to at least the 64 bytes described below; 33 + the size is stored in the superblock. 34 + 35 + If gdt\_csum is set and metadata\_csum is not set, the block group 36 + checksum is the crc16 of the FS UUID, the group number, and the group 37 + descriptor structure. 
If metadata\_csum is set, then the block group 38 + checksum is the lower 16 bits of the checksum of the FS UUID, the group 39 + number, and the group descriptor structure. Both block and inode bitmap 40 + checksums are calculated against the FS UUID, the group number, and the 41 + entire bitmap. 42 + 43 + The block group descriptor is laid out in ``struct ext4_group_desc``. 44 + 45 + .. list-table:: 46 + :widths: 1 1 1 77 47 + :header-rows: 1 48 + 49 + * - Offset 50 + - Size 51 + - Name 52 + - Description 53 + * - 0x0 54 + - \_\_le32 55 + - bg\_block\_bitmap\_lo 56 + - Lower 32-bits of location of block bitmap. 57 + * - 0x4 58 + - \_\_le32 59 + - bg\_inode\_bitmap\_lo 60 + - Lower 32-bits of location of inode bitmap. 61 + * - 0x8 62 + - \_\_le32 63 + - bg\_inode\_table\_lo 64 + - Lower 32-bits of location of inode table. 65 + * - 0xC 66 + - \_\_le16 67 + - bg\_free\_blocks\_count\_lo 68 + - Lower 16-bits of free block count. 69 + * - 0xE 70 + - \_\_le16 71 + - bg\_free\_inodes\_count\_lo 72 + - Lower 16-bits of free inode count. 73 + * - 0x10 74 + - \_\_le16 75 + - bg\_used\_dirs\_count\_lo 76 + - Lower 16-bits of directory count. 77 + * - 0x12 78 + - \_\_le16 79 + - bg\_flags 80 + - Block group flags. See the bgflags_ table below. 81 + * - 0x14 82 + - \_\_le32 83 + - bg\_exclude\_bitmap\_lo 84 + - Lower 32-bits of location of snapshot exclusion bitmap. 85 + * - 0x18 86 + - \_\_le16 87 + - bg\_block\_bitmap\_csum\_lo 88 + - Lower 16-bits of the block bitmap checksum. 89 + * - 0x1A 90 + - \_\_le16 91 + - bg\_inode\_bitmap\_csum\_lo 92 + - Lower 16-bits of the inode bitmap checksum. 93 + * - 0x1C 94 + - \_\_le16 95 + - bg\_itable\_unused\_lo 96 + - Lower 16-bits of unused inode count. If set, we needn't scan past the 97 + ``(sb.s_inodes_per_group - gdt.bg_itable_unused)``\ th entry in the 98 + inode table for this group. 
99 + * - 0x1E 100 + - \_\_le16 101 + - bg\_checksum 102 + - Group descriptor checksum; crc16(sb\_uuid+group+desc) if the 103 + RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group+desc) & 104 + 0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set. 105 + * - 106 + - 107 + - 108 + - These fields only exist if the 64bit feature is enabled and s_desc_size 109 + > 32. 110 + * - 0x20 111 + - \_\_le32 112 + - bg\_block\_bitmap\_hi 113 + - Upper 32-bits of location of block bitmap. 114 + * - 0x24 115 + - \_\_le32 116 + - bg\_inode\_bitmap\_hi 117 + - Upper 32-bits of location of inode bitmap. 118 + * - 0x28 119 + - \_\_le32 120 + - bg\_inode\_table\_hi 121 + - Upper 32-bits of location of inode table. 122 + * - 0x2C 123 + - \_\_le16 124 + - bg\_free\_blocks\_count\_hi 125 + - Upper 16-bits of free block count. 126 + * - 0x2E 127 + - \_\_le16 128 + - bg\_free\_inodes\_count\_hi 129 + - Upper 16-bits of free inode count. 130 + * - 0x30 131 + - \_\_le16 132 + - bg\_used\_dirs\_count\_hi 133 + - Upper 16-bits of directory count. 134 + * - 0x32 135 + - \_\_le16 136 + - bg\_itable\_unused\_hi 137 + - Upper 16-bits of unused inode count. 138 + * - 0x34 139 + - \_\_le32 140 + - bg\_exclude\_bitmap\_hi 141 + - Upper 32-bits of location of snapshot exclusion bitmap. 142 + * - 0x38 143 + - \_\_le16 144 + - bg\_block\_bitmap\_csum\_hi 145 + - Upper 16-bits of the block bitmap checksum. 146 + * - 0x3A 147 + - \_\_le16 148 + - bg\_inode\_bitmap\_csum\_hi 149 + - Upper 16-bits of the inode bitmap checksum. 150 + * - 0x3C 151 + - \_\_u32 152 + - bg\_reserved 153 + - Padding to 64 bytes. 154 + 155 + .. _bgflags: 156 + 157 + Block group flags can be any combination of the following: 158 + 159 + .. list-table:: 160 + :widths: 1 79 161 + :header-rows: 1 162 + 163 + * - Value 164 + - Description 165 + * - 0x1 166 + - inode table and bitmap are not initialized (EXT4\_BG\_INODE\_UNINIT). 167 + * - 0x2 168 + - block bitmap is not initialized (EXT4\_BG\_BLOCK\_UNINIT).
169 + * - 0x4 170 + - inode table is zeroed (EXT4\_BG\_INODE\_ZEROED).

+194

Documentation/filesystems/ext4/ondisk/ifork.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + The Contents of inode.i\_block 4 + ------------------------------ 5 + 6 + Depending on the type of file an inode describes, the 60 bytes of 7 + storage in ``inode.i_block`` can be used in different ways. In general, 8 + regular files and directories will use it for file block indexing 9 + information, and special files will use it for special purposes. 10 + 11 + Symbolic Links 12 + ~~~~~~~~~~~~~~ 13 + 14 + The target of a symbolic link will be stored in this field if the target 15 + string is less than 60 bytes long. Otherwise, either extents or block 16 + maps will be used to allocate data blocks to store the link target. 17 + 18 + Direct/Indirect Block Addressing 19 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 + 21 + In ext2/3, file block numbers were mapped to logical block numbers by 22 + means of an (up to) three level 1-1 block map. To find the logical block 23 + that stores a particular file block, the code would navigate through 24 + this increasingly complicated structure. Notice that there is neither a 25 + magic number nor a checksum to provide any level of confidence that the 26 + block isn't full of garbage. 27 + 28 + .. ifconfig:: builder != 'latex' 29 + 30 + .. include:: blockmap.rst 31 + 32 + .. ifconfig:: builder == 'latex' 33 + 34 + [Table omitted because LaTeX doesn't support nested tables.] 35 + 36 + Note that with this block mapping scheme, it is necessary to fill out a 37 + lot of mapping data even for a large contiguous file! This inefficiency 38 + led to the creation of the extent mapping scheme, discussed below. 39 + 40 + Notice also that a file using this mapping scheme cannot be placed 41 + higher than 2^32 blocks. 42 + 43 + Extent Tree 44 + ~~~~~~~~~~~ 45 + 46 + In ext4, the file to logical block map has been replaced with an extent 47 + tree. 
Under the old scheme, allocating a contiguous run of 1,000 blocks 48 + requires an indirect block to map all 1,000 entries; with extents, the 49 + mapping is reduced to a single ``struct ext4_extent`` with 50 + ``ee_len = 1000``. If flex\_bg is enabled, it is possible to allocate 51 + very large files with a single extent, at a considerable reduction in 52 + metadata block use, and some improvement in disk efficiency. The inode 53 + must have the extents flag (0x80000) set for this feature to be in 54 + use. 55 + 56 + Extents are arranged as a tree. Each node of the tree begins with a 57 + ``struct ext4_extent_header``. If the node is an interior node 58 + (``eh.eh_depth`` > 0), the header is followed by ``eh.eh_entries`` 59 + instances of ``struct ext4_extent_idx``; each of these index entries 60 + points to a block containing more nodes in the extent tree. If the node 61 + is a leaf node (``eh.eh_depth == 0``), then the header is followed by 62 + ``eh.eh_entries`` instances of ``struct ext4_extent``; these instances 63 + point to the file's data blocks. The root node of the extent tree is 64 + stored in ``inode.i_block``, which allows for the first four extents to 65 + be recorded without the use of extra metadata blocks. 66 + 67 + The extent tree header is recorded in ``struct ext4_extent_header``, 68 + which is 12 bytes long: 69 + 70 + .. list-table:: 71 + :widths: 1 1 1 77 72 + :header-rows: 1 73 + 74 + * - Offset 75 + - Size 76 + - Name 77 + - Description 78 + * - 0x0 79 + - \_\_le16 80 + - eh\_magic 81 + - Magic number, 0xF30A. 82 + * - 0x2 83 + - \_\_le16 84 + - eh\_entries 85 + - Number of valid entries following the header. 86 + * - 0x4 87 + - \_\_le16 88 + - eh\_max 89 + - Maximum number of entries that could follow the header. 90 + * - 0x6 91 + - \_\_le16 92 + - eh\_depth 93 + - Depth of this extent node in the extent tree. 0 = this extent node 94 + points to data blocks; otherwise, this extent node points to other 95 + extent nodes.
The extent tree can be at most 5 levels deep: a logical 96 + block number can be at most ``2^32``, and the smallest ``n`` that 97 + satisfies ``4*(((blocksize - 12)/12)^n) >= 2^32`` is 5. 98 + * - 0x8 99 + - \_\_le32 100 + - eh\_generation 101 + - Generation of the tree. (Used by Lustre, but not standard ext4). 102 + 103 + Internal nodes of the extent tree, also known as index nodes, are 104 + recorded as ``struct ext4_extent_idx``, and are 12 bytes long: 105 + 106 + .. list-table:: 107 + :widths: 1 1 1 77 108 + :header-rows: 1 109 + 110 + * - Offset 111 + - Size 112 + - Name 113 + - Description 114 + * - 0x0 115 + - \_\_le32 116 + - ei\_block 117 + - This index node covers file blocks from 'block' onward. 118 + * - 0x4 119 + - \_\_le32 120 + - ei\_leaf\_lo 121 + - Lower 32-bits of the block number of the extent node that is the next 122 + level lower in the tree. The tree node pointed to can be either another 123 + internal node or a leaf node, described below. 124 + * - 0x8 125 + - \_\_le16 126 + - ei\_leaf\_hi 127 + - Upper 16-bits of the previous field. 128 + * - 0xA 129 + - \_\_u16 130 + - ei\_unused 131 + - 132 + 133 + Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, 134 + and are also 12 bytes long: 135 + 136 + .. list-table:: 137 + :widths: 1 1 1 77 138 + :header-rows: 1 139 + 140 + * - Offset 141 + - Size 142 + - Name 143 + - Description 144 + * - 0x0 145 + - \_\_le32 146 + - ee\_block 147 + - First file block number that this extent covers. 148 + * - 0x4 149 + - \_\_le16 150 + - ee\_len 151 + - Number of blocks covered by extent. If the value of this field is <= 152 + 32768, the extent is initialized. If the value of the field is > 32768, 153 + the extent is uninitialized and the actual extent length is ``ee_len`` - 154 + 32768. Therefore, the maximum length of an initialized extent is 32768 155 + blocks, and the maximum length of an uninitialized extent is 32767.
156 + * - 0x6 157 + - \_\_le16 158 + - ee\_start\_hi 159 + - Upper 16-bits of the block number to which this extent points. 160 + * - 0x8 161 + - \_\_le32 162 + - ee\_start\_lo 163 + - Lower 32-bits of the block number to which this extent points. 164 + 165 + Prior to the introduction of metadata checksums, the extent header + 166 + extent entries always left at least 4 bytes of unallocated space at the 167 + end of each extent tree data block (because (2^x % 12) >= 4). Therefore, 168 + the 32-bit checksum is inserted into this space. The 4 extents in the 169 + inode do not need checksumming, since the inode is already checksummed. 170 + The checksum is calculated against the FS UUID, the inode number, the 171 + inode generation, and the entire extent block leading up to (but not 172 + including) the checksum itself. 173 + 174 + ``struct ext4_extent_tail`` is 4 bytes long: 175 + 176 + .. list-table:: 177 + :widths: 1 1 1 77 178 + :header-rows: 1 179 + 180 + * - Offset 181 + - Size 182 + - Name 183 + - Description 184 + * - 0x0 185 + - \_\_le32 186 + - eb\_checksum 187 + - Checksum of the extent block, crc32c(uuid+inum+igeneration+extentblock) 188 + 189 + Inline Data 190 + ~~~~~~~~~~~ 191 + 192 + If the inline data feature is enabled for the filesystem and the flag is 193 + set for the inode, it is possible that the first 60 bytes of the file 194 + data are stored here.

+9

Documentation/filesystems/ext4/ondisk/index.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + ============================== 4 + Data Structures and Algorithms 5 + ============================== 6 + .. include:: about.rst 7 + .. include:: overview.rst 8 + .. include:: globals.rst 9 + .. include:: dynamic.rst

+37

Documentation/filesystems/ext4/ondisk/inlinedata.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Inline Data 4 + ----------- 5 + 6 + The inline data feature was designed to handle the case that a file's 7 + data is so tiny that it readily fits inside the inode, which 8 + (theoretically) reduces disk block consumption and reduces seeks. If the 9 + file is smaller than 60 bytes, then the data are stored inline in 10 + ``inode.i_block``. If the rest of the file would fit inside the extended 11 + attribute space, then it might be found as an extended attribute 12 + “system.data” within the inode body (“ibody EA”). This of course 13 + constrains the amount of extended attributes one can attach to an inode. 14 + If the data size increases beyond i\_block + ibody EA, a regular block 15 + is allocated and the contents moved to that block. 16 + 17 + Pending a change to compact the extended attribute key used to store 18 + inline data, one ought to be able to store 160 bytes of data in a 19 + 256-byte inode (as of June 2015, when i\_extra\_isize is 28). Prior to 20 + that, the limit was 156 bytes due to inefficient use of inode space. 21 + 22 + The inline data feature requires the presence of an extended attribute 23 + for “system.data”, even if the attribute value is zero length. 24 + 25 + Inline Directories 26 + ~~~~~~~~~~~~~~~~~~ 27 + 28 + The first four bytes of i\_block are the inode number of the parent 29 + directory. Following that is a 56-byte space for an array of directory 30 + entries; see ``struct ext4_dir_entry``. If there is a “system.data” 31 + attribute in the inode body, the EA value is an array of 32 + ``struct ext4_dir_entry`` as well. Note that for inline directories, the 33 + i\_block and EA space are treated as separate dirent blocks; directory 34 + entries cannot span the two. 35 + 36 + Inline directory entries are not checksummed, as the inode checksum 37 + should protect all inline data contents.

+575

Documentation/filesystems/ext4/ondisk/inodes.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Index Nodes 4 + ----------- 5 + 6 + In a regular UNIX filesystem, the inode stores all the metadata 7 + pertaining to the file (time stamps, block maps, extended attributes, 8 + etc), not the directory entry. To find the information associated with a 9 + file, one must traverse the directory files to find the directory entry 10 + associated with a file, then load the inode to find the metadata for 11 + that file. ext4 appears to cheat (for performance reasons) a little bit 12 + by storing a copy of the file type (normally stored in the inode) in the 13 + directory entry. (Compare all this to FAT, which stores all the file 14 + information directly in the directory entry, but does not support hard 15 + links and is in general more seek-happy than ext4 due to its simpler 16 + block allocator and extensive use of linked lists.) 17 + 18 + The inode table is a linear array of ``struct ext4_inode``. The table is 19 + sized to have enough blocks to store at least 20 + ``sb.s_inode_size * sb.s_inodes_per_group`` bytes. The number of the 21 + block group containing an inode can be calculated as 22 + ``(inode_number - 1) / sb.s_inodes_per_group``, and the offset into the 23 + group's table is ``(inode_number - 1) % sb.s_inodes_per_group``. There 24 + is no inode 0. 25 + 26 + The inode checksum is calculated against the FS UUID, the inode number, 27 + and the inode structure itself. 28 + 29 + The inode table entry is laid out in ``struct ext4_inode``. 30 + 31 + .. list-table:: 32 + :widths: 1 1 1 77 33 + :header-rows: 1 34 + 35 + * - Offset 36 + - Size 37 + - Name 38 + - Description 39 + * - 0x0 40 + - \_\_le16 41 + - i\_mode 42 + - File mode. See the table i_mode_ below. 43 + * - 0x2 44 + - \_\_le16 45 + - i\_uid 46 + - Lower 16-bits of Owner UID. 47 + * - 0x4 48 + - \_\_le32 49 + - i\_size\_lo 50 + - Lower 32-bits of size in bytes. 
51 + * - 0x8 52 + - \_\_le32 53 + - i\_atime 54 + - Last access time, in seconds since the epoch. However, if the EA\_INODE 55 + inode flag is set, this inode stores an extended attribute value and 56 + this field contains the checksum of the value. 57 + * - 0xC 58 + - \_\_le32 59 + - i\_ctime 60 + - Last inode change time, in seconds since the epoch. However, if the 61 + EA\_INODE inode flag is set, this inode stores an extended attribute 62 + value and this field contains the lower 32 bits of the attribute value's 63 + reference count. 64 + * - 0x10 65 + - \_\_le32 66 + - i\_mtime 67 + - Last data modification time, in seconds since the epoch. However, if the 68 + EA\_INODE inode flag is set, this inode stores an extended attribute 69 + value and this field contains the number of the inode that owns the 70 + extended attribute. 71 + * - 0x14 72 + - \_\_le32 73 + - i\_dtime 74 + - Deletion Time, in seconds since the epoch. 75 + * - 0x18 76 + - \_\_le16 77 + - i\_gid 78 + - Lower 16-bits of GID. 79 + * - 0x1A 80 + - \_\_le16 81 + - i\_links\_count 82 + - Hard link count. Normally, ext4 does not permit an inode to have more 83 + than 65,000 hard links. This applies to files as well as directories, 84 + which means that there cannot be more than 64,998 subdirectories in a 85 + directory (each subdirectory's '..' entry counts as a hard link, as does 86 + the '.' entry in the directory itself). With the DIR\_NLINK feature 87 + enabled, ext4 supports more than 64,998 subdirectories by setting this 88 + field to 1 to indicate that the number of hard links is not known. 89 + * - 0x1C 90 + - \_\_le32 91 + - i\_blocks\_lo 92 + - Lower 32-bits of “block” count. If the huge\_file feature flag is not 93 + set on the filesystem, the file consumes ``i_blocks_lo`` 512-byte blocks 94 + on disk. If huge\_file is set and EXT4\_HUGE\_FILE\_FL is NOT set in 95 + ``inode.i_flags``, then the file consumes ``i_blocks_lo + (i_blocks_hi 96 + << 32)`` 512-byte blocks on disk. 
If huge\_file is set and 97 + EXT4\_HUGE\_FILE\_FL IS set in ``inode.i_flags``, then this file 98 + consumes ``i_blocks_lo + (i_blocks_hi << 32)`` filesystem blocks on 99 + disk. 100 + * - 0x20 101 + - \_\_le32 102 + - i\_flags 103 + - Inode flags. See the table i_flags_ below. 104 + * - 0x24 105 + - 4 bytes 106 + - i\_osd1 107 + - See the table i_osd1_ for more details. 108 + * - 0x28 109 + - 60 bytes 110 + - i\_block[EXT4\_N\_BLOCKS=15] 111 + - Block map or extent tree. See the section “The Contents of inode.i\_block”. 112 + * - 0x64 113 + - \_\_le32 114 + - i\_generation 115 + - File version (for NFS). 116 + * - 0x68 117 + - \_\_le32 118 + - i\_file\_acl\_lo 119 + - Lower 32-bits of extended attribute block. ACLs are of course one of 120 + many possible extended attributes; I think the name of this field is a 121 + result of the first use of extended attributes being for ACLs. 122 + * - 0x6C 123 + - \_\_le32 124 + - i\_size\_high / i\_dir\_acl 125 + - Upper 32-bits of file/directory size. In ext2/3 this field was named 126 + i\_dir\_acl, though it was usually set to zero and never used. 127 + * - 0x70 128 + - \_\_le32 129 + - i\_obso\_faddr 130 + - (Obsolete) fragment address. 131 + * - 0x74 132 + - 12 bytes 133 + - i\_osd2 134 + - See the table i_osd2_ for more details. 135 + * - 0x80 136 + - \_\_le16 137 + - i\_extra\_isize 138 + - Size of this inode - 128. Alternately, the size of the extended inode 139 + fields beyond the original ext2 inode, including this field. 140 + * - 0x82 141 + - \_\_le16 142 + - i\_checksum\_hi 143 + - Upper 16-bits of the inode checksum. 144 + * - 0x84 145 + - \_\_le32 146 + - i\_ctime\_extra 147 + - Extra change time bits. This provides sub-second precision. See Inode 148 + Timestamps section. 149 + * - 0x88 150 + - \_\_le32 151 + - i\_mtime\_extra 152 + - Extra modification time bits. This provides sub-second precision. 153 + * - 0x8C 154 + - \_\_le32 155 + - i\_atime\_extra 156 + - Extra access time bits.
This provides sub-second precision. 157 + * - 0x90 158 + - \_\_le32 159 + - i\_crtime 160 + - File creation time, in seconds since the epoch. 161 + * - 0x94 162 + - \_\_le32 163 + - i\_crtime\_extra 164 + - Extra file creation time bits. This provides sub-second precision. 165 + * - 0x98 166 + - \_\_le32 167 + - i\_version\_hi 168 + - Upper 32-bits for version number. 169 + * - 0x9C 170 + - \_\_le32 171 + - i\_projid 172 + - Project ID. 173 + 174 + .. _i_mode: 175 + 176 + The ``i_mode`` value is a combination of the following flags: 177 + 178 + .. list-table:: 179 + :widths: 1 79 180 + :header-rows: 1 181 + 182 + * - Value 183 + - Description 184 + * - 0x1 185 + - S\_IXOTH (Others may execute) 186 + * - 0x2 187 + - S\_IWOTH (Others may write) 188 + * - 0x4 189 + - S\_IROTH (Others may read) 190 + * - 0x8 191 + - S\_IXGRP (Group members may execute) 192 + * - 0x10 193 + - S\_IWGRP (Group members may write) 194 + * - 0x20 195 + - S\_IRGRP (Group members may read) 196 + * - 0x40 197 + - S\_IXUSR (Owner may execute) 198 + * - 0x80 199 + - S\_IWUSR (Owner may write) 200 + * - 0x100 201 + - S\_IRUSR (Owner may read) 202 + * - 0x200 203 + - S\_ISVTX (Sticky bit) 204 + * - 0x400 205 + - S\_ISGID (Set GID) 206 + * - 0x800 207 + - S\_ISUID (Set UID) 208 + * - 209 + - These are mutually-exclusive file types: 210 + * - 0x1000 211 + - S\_IFIFO (FIFO) 212 + * - 0x2000 213 + - S\_IFCHR (Character device) 214 + * - 0x4000 215 + - S\_IFDIR (Directory) 216 + * - 0x6000 217 + - S\_IFBLK (Block device) 218 + * - 0x8000 219 + - S\_IFREG (Regular file) 220 + * - 0xA000 221 + - S\_IFLNK (Symbolic link) 222 + * - 0xC000 223 + - S\_IFSOCK (Socket) 224 + 225 + .. _i_flags: 226 + 227 + The ``i_flags`` field is a combination of these values: 228 + 229 + .. list-table:: 230 + :widths: 1 79 231 + :header-rows: 1 232 + 233 + * - Value 234 + - Description 235 + * - 0x1 236 + - This file requires secure deletion (EXT4\_SECRM\_FL). 
(not implemented) 237 + * - 0x2 238 + - This file should be preserved, should undeletion be desired 239 + (EXT4\_UNRM\_FL). (not implemented) 240 + * - 0x4 241 + - File is compressed (EXT4\_COMPR\_FL). (not really implemented) 242 + * - 0x8 243 + - All writes to the file must be synchronous (EXT4\_SYNC\_FL). 244 + * - 0x10 245 + - File is immutable (EXT4\_IMMUTABLE\_FL). 246 + * - 0x20 247 + - File can only be appended (EXT4\_APPEND\_FL). 248 + * - 0x40 249 + - The dump(1) utility should not dump this file (EXT4\_NODUMP\_FL). 250 + * - 0x80 251 + - Do not update access time (EXT4\_NOATIME\_FL). 252 + * - 0x100 253 + - Dirty compressed file (EXT4\_DIRTY\_FL). (not used) 254 + * - 0x200 255 + - File has one or more compressed clusters (EXT4\_COMPRBLK\_FL). (not used) 256 + * - 0x400 257 + - Do not compress file (EXT4\_NOCOMPR\_FL). (not used) 258 + * - 0x800 259 + - Encrypted inode (EXT4\_ENCRYPT\_FL). This bit value previously was 260 + EXT4\_ECOMPR\_FL (compression error), which was never used. 261 + * - 0x1000 262 + - Directory has hashed indexes (EXT4\_INDEX\_FL). 263 + * - 0x2000 264 + - AFS magic directory (EXT4\_IMAGIC\_FL). 265 + * - 0x4000 266 + - File data must always be written through the journal 267 + (EXT4\_JOURNAL\_DATA\_FL). 268 + * - 0x8000 269 + - File tail should not be merged (EXT4\_NOTAIL\_FL). (not used by ext4) 270 + * - 0x10000 271 + - All directory entry data should be written synchronously (see 272 + ``dirsync``) (EXT4\_DIRSYNC\_FL). 273 + * - 0x20000 274 + - Top of directory hierarchy (EXT4\_TOPDIR\_FL). 275 + * - 0x40000 276 + - This is a huge file (EXT4\_HUGE\_FILE\_FL). 277 + * - 0x80000 278 + - Inode uses extents (EXT4\_EXTENTS\_FL). 279 + * - 0x200000 280 + - Inode stores a large extended attribute value in its data blocks 281 + (EXT4\_EA\_INODE\_FL). 282 + * - 0x400000 283 + - This file has blocks allocated past EOF (EXT4\_EOFBLOCKS\_FL). 284 + (deprecated) 285 + * - 0x01000000 286 + - Inode is a snapshot (``EXT4_SNAPFILE_FL``). 
(not in mainline) 287 + * - 0x04000000 288 + - Snapshot is being deleted (``EXT4_SNAPFILE_DELETED_FL``). (not in 289 + mainline) 290 + * - 0x08000000 291 + - Snapshot shrink has completed (``EXT4_SNAPFILE_SHRUNK_FL``). (not in 292 + mainline) 293 + * - 0x10000000 294 + - Inode has inline data (EXT4\_INLINE\_DATA\_FL). 295 + * - 0x20000000 296 + - Create children with the same project ID (EXT4\_PROJINHERIT\_FL). 297 + * - 0x80000000 298 + - Reserved for ext4 library (EXT4\_RESERVED\_FL). 299 + * - 300 + - Aggregate flags: 301 + * - 0x4BDFFF 302 + - User-visible flags. 303 + * - 0x4B80FF 304 + - User-modifiable flags. Note that while EXT4\_JOURNAL\_DATA\_FL and 305 + EXT4\_EXTENTS\_FL can be set with setattr, they are not in the kernel's 306 + EXT4\_FL\_USER\_MODIFIABLE mask, since it needs to handle the setting of 307 + these flags in a special manner and they are masked out of the set of 308 + flags that are saved directly to i\_flags. 309 + 310 + .. _i_osd1: 311 + 312 + The ``osd1`` field has multiple meanings depending on the creator: 313 + 314 + Linux: 315 + 316 + .. list-table:: 317 + :widths: 1 1 1 77 318 + :header-rows: 1 319 + 320 + * - Offset 321 + - Size 322 + - Name 323 + - Description 324 + * - 0x0 325 + - \_\_le32 326 + - l\_i\_version 327 + - Inode version. However, if the EA\_INODE inode flag is set, this inode 328 + stores an extended attribute value and this field contains the upper 32 329 + bits of the attribute value's reference count. 330 + 331 + Hurd: 332 + 333 + .. list-table:: 334 + :widths: 1 1 1 77 335 + :header-rows: 1 336 + 337 + * - Offset 338 + - Size 339 + - Name 340 + - Description 341 + * - 0x0 342 + - \_\_le32 343 + - h\_i\_translator 344 + - ?? 345 + 346 + Masix: 347 + 348 + .. list-table:: 349 + :widths: 1 1 1 77 350 + :header-rows: 1 351 + 352 + * - Offset 353 + - Size 354 + - Name 355 + - Description 356 + * - 0x0 357 + - \_\_le32 358 + - m\_i\_reserved 359 + - ?? 360 + 361 + .. 
_i_osd2: 362 + 363 + The ``osd2`` field has multiple meanings depending on the filesystem creator: 364 + 365 + Linux: 366 + 367 + .. list-table:: 368 + :widths: 1 1 1 77 369 + :header-rows: 1 370 + 371 + * - Offset 372 + - Size 373 + - Name 374 + - Description 375 + * - 0x0 376 + - \_\_le16 377 + - l\_i\_blocks\_high 378 + - Upper 16-bits of the block count. Please see the note attached to 379 + i\_blocks\_lo. 380 + * - 0x2 381 + - \_\_le16 382 + - l\_i\_file\_acl\_high 383 + - Upper 16-bits of the extended attribute block (historically, the file 384 + ACL location). See the Extended Attributes section below. 385 + * - 0x4 386 + - \_\_le16 387 + - l\_i\_uid\_high 388 + - Upper 16-bits of the Owner UID. 389 + * - 0x6 390 + - \_\_le16 391 + - l\_i\_gid\_high 392 + - Upper 16-bits of the GID. 393 + * - 0x8 394 + - \_\_le16 395 + - l\_i\_checksum\_lo 396 + - Lower 16-bits of the inode checksum. 397 + * - 0xA 398 + - \_\_le16 399 + - l\_i\_reserved 400 + - Unused. 401 + 402 + Hurd: 403 + 404 + .. list-table:: 405 + :widths: 1 1 1 77 406 + :header-rows: 1 407 + 408 + * - Offset 409 + - Size 410 + - Name 411 + - Description 412 + * - 0x0 413 + - \_\_le16 414 + - h\_i\_reserved1 415 + - ?? 416 + * - 0x2 417 + - \_\_u16 418 + - h\_i\_mode\_high 419 + - Upper 16-bits of the file mode. 420 + * - 0x4 421 + - \_\_le16 422 + - h\_i\_uid\_high 423 + - Upper 16-bits of the Owner UID. 424 + * - 0x6 425 + - \_\_le16 426 + - h\_i\_gid\_high 427 + - Upper 16-bits of the GID. 428 + * - 0x8 429 + - \_\_u32 430 + - h\_i\_author 431 + - Author code? 432 + 433 + Masix: 434 + 435 + .. list-table:: 436 + :widths: 1 1 1 77 437 + :header-rows: 1 438 + 439 + * - Offset 440 + - Size 441 + - Name 442 + - Description 443 + * - 0x0 444 + - \_\_le16 445 + - h\_i\_reserved1 446 + - ?? 447 + * - 0x2 448 + - \_\_u16 449 + - m\_i\_file\_acl\_high 450 + - Upper 16-bits of the extended attribute block (historically, the file 451 + ACL location). 
452 + * - 0x4 453 + - \_\_u32 454 + - m\_i\_reserved2[2] 455 + - ?? 456 + 457 + Inode Size 458 + ~~~~~~~~~~ 459 + 460 + In ext2 and ext3, the inode structure size was fixed at 128 bytes 461 + (``EXT2_GOOD_OLD_INODE_SIZE``) and each inode had a disk record size of 462 + 128 bytes. Starting with ext4, it is possible to allocate a larger 463 + on-disk inode at format time for all inodes in the filesystem to provide 464 + space beyond the end of the original ext2 inode. The on-disk inode 465 + record size is recorded in the superblock as ``s_inode_size``. The 466 + number of bytes actually used by struct ext4\_inode beyond the original 467 + 128-byte ext2 inode is recorded in the ``i_extra_isize`` field for each 468 + inode, which allows struct ext4\_inode to grow for a new kernel without 469 + having to upgrade all of the on-disk inodes. Access to fields beyond 470 + EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within 471 + ``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as 472 + of October 2013) the inode structure is 156 bytes 473 + (``i_extra_isize = 28``). The extra space between the end of the inode 474 + structure and the end of the inode record can be used to store extended 475 + attributes. Each inode record can be as large as the filesystem block 476 + size, though this is not terribly efficient. 477 + 478 + Finding an Inode 479 + ~~~~~~~~~~~~~~~~ 480 + 481 + Each block group contains ``sb->s_inodes_per_group`` inodes. Because 482 + inode 0 is defined not to exist, this formula can be used to find the 483 + block group that an inode lives in: 484 + ``bg = (inode_num - 1) / sb->s_inodes_per_group``. The particular inode 485 + can be found within the block group's inode table at 486 + ``index = (inode_num - 1) % sb->s_inodes_per_group``. To get the byte 487 + address within the inode table, use 488 + ``offset = index * sb->s_inode_size``. 
489 + 490 + Inode Timestamps 491 + ~~~~~~~~~~~~~~~~ 492 + 493 + Four timestamps are recorded in the lower 128 bytes of the inode 494 + structure -- inode change time (ctime), access time (atime), data 495 + modification time (mtime), and deletion time (dtime). The four fields 496 + are 32-bit signed integers that represent seconds since the Unix epoch 497 + (1970-01-01 00:00:00 GMT), which means that the fields will overflow in 498 + January 2038. For inodes that are not linked from any directory but are 499 + still open (orphan inodes), the dtime field is overloaded for use with 500 + the orphan list. The superblock field ``s_last_orphan`` points to the 501 + first inode in the orphan list; dtime is then the number of the next 502 + orphaned inode, or zero if there are no more orphans. 503 + 504 + If the inode structure size ``sb->s_inode_size`` is larger than 128 505 + bytes and the ``i_extra_isize`` field is large enough to encompass the 506 + respective ``i_[cma]time_extra`` field, the ctime, atime, and mtime 507 + inode fields are widened to 64 bits. Within this “extra” 32-bit field, 508 + the lower two bits are used to extend the 32-bit seconds field to be 34 509 + bits wide; the upper 30 bits are used to provide nanosecond timestamp 510 + accuracy. Therefore, timestamps should not overflow until May 2446. 511 + dtime was not widened. There is also a fifth timestamp to record inode 512 + creation time (crtime); this field is 64-bits wide and decoded in the 513 + same manner as 64-bit [cma]time. Neither crtime nor dtime are accessible 514 + through the regular stat() interface, though debugfs will report them. 515 + 516 + We use the 32-bit signed time value plus (2^32 \* (extra epoch bits)). 517 + In other words: 518 + 519 + ..
list-table:: 520 + :widths: 20 20 20 20 20 521 + :header-rows: 1 522 + 523 + * - Extra epoch bits 524 + - MSB of 32-bit time 525 + - Adjustment for signed 32-bit to 64-bit tv\_sec 526 + - Decoded 64-bit tv\_sec 527 + - valid time range 528 + * - 0 0 529 + - 1 530 + - 0 531 + - ``-0x80000000 - -0x00000001`` 532 + - 1901-12-13 to 1969-12-31 533 + * - 0 0 534 + - 0 535 + - 0 536 + - ``0x000000000 - 0x07fffffff`` 537 + - 1970-01-01 to 2038-01-19 538 + * - 0 1 539 + - 1 540 + - 0x100000000 541 + - ``0x080000000 - 0x0ffffffff`` 542 + - 2038-01-19 to 2106-02-07 543 + * - 0 1 544 + - 0 545 + - 0x100000000 546 + - ``0x100000000 - 0x17fffffff`` 547 + - 2106-02-07 to 2174-02-25 548 + * - 1 0 549 + - 1 550 + - 0x200000000 551 + - ``0x180000000 - 0x1ffffffff`` 552 + - 2174-02-25 to 2242-03-16 553 + * - 1 0 554 + - 0 555 + - 0x200000000 556 + - ``0x200000000 - 0x27fffffff`` 557 + - 2242-03-16 to 2310-04-04 558 + * - 1 1 559 + - 1 560 + - 0x300000000 561 + - ``0x280000000 - 0x2ffffffff`` 562 + - 2310-04-04 to 2378-04-22 563 + * - 1 1 564 + - 0 565 + - 0x300000000 566 + - ``0x300000000 - 0x37fffffff`` 567 + - 2378-04-22 to 2446-05-10 568 + 569 + This is a somewhat odd encoding since there are effectively seven times 570 + as many positive values as negative values. There have also been 571 + long-standing bugs decoding and encoding dates beyond 2038, which don't 572 + seem to be fixed as of kernel 3.12 and e2fsprogs 1.42.8. 64-bit kernels 573 + incorrectly use the extra epoch bits 1,1 for dates between 1901 and 574 + 1970. At some point the kernel will be fixed and e2fsck will fix this 575 + situation, assuming that it is run before 2310.

+611

Documentation/filesystems/ext4/ondisk/journal.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Journal (jbd2) 4 + -------------- 5 + 6 + Introduced in ext3, the ext4 filesystem employs a journal to protect the 7 + filesystem against corruption in the case of a system crash. A small 8 + contiguous region of disk (default 128MiB) is reserved inside the 9 + filesystem as a place to land “important” data writes on-disk as quickly 10 + as possible. Once the important data transaction is fully written to the 11 + disk and flushed from the disk write cache, a record of the data being 12 + committed is also written to the journal. At some later point in time, 13 + the journal code writes the transactions to their final locations on 14 + disk (this could involve a lot of seeking or a lot of small 15 + read-write-erases) before erasing the commit record. Should the system 16 + crash during the second slow write, the journal can be replayed all the 17 + way to the latest commit record, guaranteeing the atomicity of whatever 18 + gets written through the journal to the disk. The effect of this is to 19 + guarantee that the filesystem does not become stuck midway through a 20 + metadata update. 21 + 22 + For performance reasons, ext4 by default only writes filesystem metadata 23 + through the journal. This means that file data blocks are /not/ 24 + guaranteed to be in any consistent state after a crash. If this default 25 + guarantee level (``data=ordered``) is not satisfactory, there is a mount 26 + option to control journal behavior. If ``data=journal``, all data and 27 + metadata are written to disk through the journal. This is slower but 28 + safest. If ``data=writeback``, dirty data blocks are not flushed to the 29 + disk before the metadata are written to disk through the journal. 30 + 31 + The journal inode is typically inode 8. The first 68 bytes of the 32 + journal inode are replicated in the ext4 superblock. The journal itself 33 + is a normal (but hidden) file within the filesystem.
The file usually 34 + consumes an entire block group, though mke2fs tries to put it in the 35 + middle of the disk. 36 + 37 + All fields in jbd2 are written to disk in big-endian order. This is the 38 + opposite of ext4. 39 + 40 + NOTE: Both ext4 and ocfs2 use jbd2. 41 + 42 + The maximum size of a journal embedded in an ext4 filesystem is 2^32 43 + blocks. jbd2 itself does not seem to care. 44 + 45 + Layout 46 + ~~~~~~ 47 + 48 + Generally speaking, the journal has this format: 49 + 50 + .. list-table:: 51 + :widths: 1 1 78 52 + :header-rows: 1 53 + 54 + * - Superblock 55 + - descriptor\_block (data\_blocks or revocation\_block) [more data or 56 + revocations] commit\_block 57 + - [more transactions...] 58 + * - 59 + - One transaction 60 + - 61 + 62 + Notice that a transaction begins with either a descriptor and some data, 63 + or a block revocation list. A finished transaction always ends with a 64 + commit. If there is no commit record (or the checksums don't match), the 65 + transaction will be discarded during replay. 66 + 67 + External Journal 68 + ~~~~~~~~~~~~~~~~ 69 + 70 + Optionally, an ext4 filesystem can be created with an external journal 71 + device (as opposed to an internal journal, which uses a reserved inode). 72 + In this case, on the filesystem device, ``s_journal_inum`` should be 73 + zero and ``s_journal_uuid`` should be set. On the journal device there 74 + will be an ext4 super block in the usual place, with a matching UUID. 75 + The journal superblock will be in the next full block after the 76 + superblock. 77 + 78 + .. list-table:: 79 + :widths: 1 1 1 1 76 80 + :header-rows: 1 81 + 82 + * - 1024 bytes of padding 83 + - ext4 Superblock 84 + - Journal Superblock 85 + - descriptor\_block (data\_blocks or revocation\_block) [more data or 86 + revocations] commit\_block 87 + - [more transactions...] 
88 + * - 89 + - 90 + - 91 + - One transaction 92 + - 93 + 94 + Block Header 95 + ~~~~~~~~~~~~ 96 + 97 + Every block in the journal starts with a common 12-byte header 98 + ``struct journal_header_s``: 99 + 100 + .. list-table:: 101 + :widths: 1 1 1 77 102 + :header-rows: 1 103 + 104 + * - Offset 105 + - Type 106 + - Name 107 + - Description 108 + * - 0x0 109 + - \_\_be32 110 + - h\_magic 111 + - jbd2 magic number, 0xC03B3998. 112 + * - 0x4 113 + - \_\_be32 114 + - h\_blocktype 115 + - Description of what this block contains. See the jbd2_blocktype_ table 116 + below. 117 + * - 0x8 118 + - \_\_be32 119 + - h\_sequence 120 + - The transaction ID that goes with this block. 121 + 122 + .. _jbd2_blocktype: 123 + 124 + The journal block type can be any one of: 125 + 126 + .. list-table:: 127 + :widths: 1 79 128 + :header-rows: 1 129 + 130 + * - Value 131 + - Description 132 + * - 1 133 + - Descriptor. This block precedes a series of data blocks that were 134 + written through the journal during a transaction. 135 + * - 2 136 + - Block commit record. This block signifies the completion of a 137 + transaction. 138 + * - 3 139 + - Journal superblock, v1. 140 + * - 4 141 + - Journal superblock, v2. 142 + * - 5 143 + - Block revocation records. This speeds up recovery by enabling the 144 + journal to skip writing blocks that were subsequently rewritten. 145 + 146 + Super Block 147 + ~~~~~~~~~~~ 148 + 149 + The super block for the journal is much simpler as compared to ext4's. 150 + The key data kept within are size of the journal, and where to find the 151 + start of the log of transactions. 152 + 153 + The journal superblock is recorded as ``struct journal_superblock_s``, 154 + which is 1024 bytes long: 155 + 156 + .. list-table:: 157 + :widths: 1 1 1 77 158 + :header-rows: 1 159 + 160 + * - Offset 161 + - Type 162 + - Name 163 + - Description 164 + * - 165 + - 166 + - 167 + - Static information describing the journal. 
168 + * - 0x0 169 + - journal\_header\_t (12 bytes) 170 + - s\_header 171 + - Common header identifying this as a superblock. 172 + * - 0xC 173 + - \_\_be32 174 + - s\_blocksize 175 + - Journal device block size. 176 + * - 0x10 177 + - \_\_be32 178 + - s\_maxlen 179 + - Total number of blocks in this journal. 180 + * - 0x14 181 + - \_\_be32 182 + - s\_first 183 + - First block of log information. 184 + * - 185 + - 186 + - 187 + - Dynamic information describing the current state of the log. 188 + * - 0x18 189 + - \_\_be32 190 + - s\_sequence 191 + - First commit ID expected in log. 192 + * - 0x1C 193 + - \_\_be32 194 + - s\_start 195 + - Block number of the start of log. Contrary to the comments, this field 196 + being zero does not imply that the journal is clean! 197 + * - 0x20 198 + - \_\_be32 199 + - s\_errno 200 + - Error value, as set by jbd2\_journal\_abort(). 201 + * - 202 + - 203 + - 204 + - The remaining fields are only valid in a v2 superblock. 205 + * - 0x24 206 + - \_\_be32 207 + - s\_feature\_compat 208 + - Compatible feature set. See the table jbd2_compat_ below. 209 + * - 0x28 210 + - \_\_be32 211 + - s\_feature\_incompat 212 + - Incompatible feature set. See the table jbd2_incompat_ below. 213 + * - 0x2C 214 + - \_\_be32 215 + - s\_feature\_ro\_compat 216 + - Read-only compatible feature set. There aren't any of these currently. 217 + * - 0x30 218 + - \_\_u8 219 + - s\_uuid[16] 220 + - 128-bit uuid for journal. This is compared against the copy in the ext4 221 + super block at mount time. 222 + * - 0x40 223 + - \_\_be32 224 + - s\_nr\_users 225 + - Number of file systems sharing this journal. 226 + * - 0x44 227 + - \_\_be32 228 + - s\_dynsuper 229 + - Location of dynamic super block copy. (Not used?) 230 + * - 0x48 231 + - \_\_be32 232 + - s\_max\_transaction 233 + - Limit of journal blocks per transaction. (Not used?) 234 + * - 0x4C 235 + - \_\_be32 236 + - s\_max\_trans\_data 237 + - Limit of data blocks per transaction. (Not used?) 
238 + * - 0x50 239 + - \_\_u8 240 + - s\_checksum\_type 241 + - Checksum algorithm used for the journal. See jbd2_checksum_type_ for 242 + more info. 243 + * - 0x51 244 + - \_\_u8[3] 245 + - s\_padding2 246 + - 247 + * - 0x54 248 + - \_\_u32 249 + - s\_padding[42] 250 + - 251 + * - 0xFC 252 + - \_\_be32 253 + - s\_checksum 254 + - Checksum of the entire superblock, with this field set to zero. 255 + * - 0x100 256 + - \_\_u8 257 + - s\_users[16\*48] 258 + - ids of all file systems sharing the log. e2fsprogs/Linux don't allow 259 + shared external journals, but I imagine Lustre (or ocfs2?), which use 260 + the jbd2 code, might. 261 + 262 + .. _jbd2_compat: 263 + 264 + The journal compat features are any combination of the following: 265 + 266 + .. list-table:: 267 + :widths: 1 79 268 + :header-rows: 1 269 + 270 + * - Value 271 + - Description 272 + * - 0x1 273 + - Journal maintains checksums on the data blocks. 274 + (JBD2\_FEATURE\_COMPAT\_CHECKSUM) 275 + 276 + .. _jbd2_incompat: 277 + 278 + The journal incompat features are any combination of the following: 279 + 280 + .. list-table:: 281 + :widths: 1 79 282 + :header-rows: 1 283 + 284 + * - Value 285 + - Description 286 + * - 0x1 287 + - Journal has block revocation records. (JBD2\_FEATURE\_INCOMPAT\_REVOKE) 288 + * - 0x2 289 + - Journal can deal with 64-bit block numbers. 290 + (JBD2\_FEATURE\_INCOMPAT\_64BIT) 291 + * - 0x4 292 + - Journal commits asynchronously. (JBD2\_FEATURE\_INCOMPAT\_ASYNC\_COMMIT) 293 + * - 0x8 294 + - This journal uses v2 of the checksum on-disk format. Each journal 295 + metadata block gets its own checksum, and the block tags in the 296 + descriptor table contain checksums for each of the data blocks in the 297 + journal. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2) 298 + * - 0x10 299 + - This journal uses v3 of the checksum on-disk format. This is the same as 300 + v2, but the journal block tag size is fixed regardless of the size of 301 + block numbers. 
(JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3) 302 + 303 + .. _jbd2_checksum_type: 304 + 305 + Journal checksum type codes are one of the following. crc32 or crc32c are the 306 + most likely choices. 307 + 308 + .. list-table:: 309 + :widths: 1 79 310 + :header-rows: 1 311 + 312 + * - Value 313 + - Description 314 + * - 1 315 + - CRC32 316 + * - 2 317 + - MD5 318 + * - 3 319 + - SHA1 320 + * - 4 321 + - CRC32C 322 + 323 + Descriptor Block 324 + ~~~~~~~~~~~~~~~~ 325 + 326 + The descriptor block contains an array of journal block tags that 327 + describe the final locations of the data blocks that follow in the 328 + journal. Descriptor blocks are open-coded instead of being completely 329 + described by a data structure, but here is the block structure anyway. 330 + Descriptor blocks consume at least 36 bytes, but use a full block: 331 + 332 + .. list-table:: 333 + :widths: 1 1 1 77 334 + :header-rows: 1 335 + 336 + * - Offset 337 + - Type 338 + - Name 339 + - Descriptor 340 + * - 0x0 341 + - journal\_header\_t 342 + - (open coded) 343 + - Common block header. 344 + * - 0xC 345 + - struct journal\_block\_tag\_s 346 + - open coded array[] 347 + - Enough tags either to fill up the block or to describe all the data 348 + blocks that follow this descriptor block. 349 + 350 + Journal block tags have any of the following formats, depending on which 351 + journal feature and block tag flags are set. 352 + 353 + If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is set, the journal block tag is 354 + defined as ``struct journal_block_tag3_s``, which looks like the 355 + following. The size is 16 or 32 bytes. 356 + 357 + .. list-table:: 358 + :widths: 1 1 1 77 359 + :header-rows: 1 360 + 361 + * - Offset 362 + - Type 363 + - Name 364 + - Descriptor 365 + * - 0x0 366 + - \_\_be32 367 + - t\_blocknr 368 + - Lower 32-bits of the location of where the corresponding data block 369 + should end up on disk. 370 + * - 0x4 371 + - \_\_be32 372 + - t\_flags 373 + - Flags that go with the descriptor. 
See the table jbd2_tag_flags_ for 374 + more info. 375 + * - 0x8 376 + - \_\_be32 377 + - t\_blocknr\_high 378 + - Upper 32-bits of the location of where the corresponding data block 379 + should end up on disk. This is zero if JBD2\_FEATURE\_INCOMPAT\_64BIT is 380 + not enabled. 381 + * - 0xC 382 + - \_\_be32 383 + - t\_checksum 384 + - Checksum of the journal UUID, the sequence number, and the data block. 385 + * - 386 + - 387 + - 388 + - This field appears to be open coded. It always comes at the end of the 389 + tag, after t_checksum. This field is not present if the "same UUID" flag 390 + is set. 391 + * - 0x8 or 0xC 392 + - char 393 + - uuid[16] 394 + - A UUID to go with this tag. This field appears to be copied from the 395 + ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that 396 + field. 397 + 398 + .. _jbd2_tag_flags: 399 + 400 + The journal tag flags are any combination of the following: 401 + 402 + .. list-table:: 403 + :widths: 1 79 404 + :header-rows: 1 405 + 406 + * - Value 407 + - Description 408 + * - 0x1 409 + - On-disk block is escaped. The first four bytes of the data block just 410 + happened to match the jbd2 magic number. 411 + * - 0x2 412 + - This block has the same UUID as previous, therefore the UUID field is 413 + omitted. 414 + * - 0x4 415 + - The data block was deleted by the transaction. (Not used?) 416 + * - 0x8 417 + - This is the last tag in this descriptor block. 418 + 419 + If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is NOT set, the journal block tag 420 + is defined as ``struct journal_block_tag_s``, which looks like the 421 + following. The size is 8, 12, 24, or 28 bytes: 422 + 423 + .. list-table:: 424 + :widths: 1 1 1 77 425 + :header-rows: 1 426 + 427 + * - Offset 428 + - Type 429 + - Name 430 + - Descriptor 431 + * - 0x0 432 + - \_\_be32 433 + - t\_blocknr 434 + - Lower 32-bits of the location of where the corresponding data block 435 + should end up on disk. 
436 + * - 0x4 437 + - \_\_be16 438 + - t\_checksum 439 + - Checksum of the journal UUID, the sequence number, and the data block. 440 + Note that only the lower 16 bits are stored. 441 + * - 0x6 442 + - \_\_be16 443 + - t\_flags 444 + - Flags that go with the descriptor. See the table jbd2_tag_flags_ for 445 + more info. 446 + * - 447 + - 448 + - 449 + - This next field is only present if the super block indicates support for 450 + 64-bit block numbers. 451 + * - 0x8 452 + - \_\_be32 453 + - t\_blocknr\_high 454 + - Upper 32-bits of the location of where the corresponding data block 455 + should end up on disk. 456 + * - 457 + - 458 + - 459 + - This field appears to be open coded. It always comes at the end of the 460 + tag, after t_flags or t_blocknr_high. This field is not present if the 461 + "same UUID" flag is set. 462 + * - 0x8 or 0xC 463 + - char 464 + - uuid[16] 465 + - A UUID to go with this tag. This field appears to be copied from the 466 + ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that 467 + field. 468 + 469 + If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or 470 + JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a 471 + ``struct jbd2_journal_block_tail``, which looks like this: 472 + 473 + .. list-table:: 474 + :widths: 1 1 1 77 475 + :header-rows: 1 476 + 477 + * - Offset 478 + - Type 479 + - Name 480 + - Descriptor 481 + * - 0x0 482 + - \_\_be32 483 + - t\_checksum 484 + - Checksum of the journal UUID + the descriptor block, with this field set 485 + to zero. 486 + 487 + Data Block 488 + ~~~~~~~~~~ 489 + 490 + In general, the data blocks being written to disk through the journal 491 + are written verbatim into the journal file after the descriptor block. 492 + However, if the first four bytes of the block match the jbd2 magic 493 + number then those four bytes are replaced with zeroes and the “escaped” 494 + flag is set in the descriptor block tag. 
495 + 496 + Revocation Block 497 + ~~~~~~~~~~~~~~~~ 498 + 499 + A revocation block is used to prevent replay of a block in an earlier 500 + transaction. This is used to mark blocks that were journalled at one 501 + time but are no longer journalled. Typically this happens if a metadata 502 + block is freed and re-allocated as a file data block; in this case, a 503 + journal replay after the file block was written to disk will cause 504 + corruption. 505 + 506 + **NOTE**: This mechanism is NOT used to express “this journal block is 507 + superseded by this other journal block”, as the author (djwong) 508 + mistakenly thought. Any block being added to a transaction will cause 509 + the removal of all existing revocation records for that block. 510 + 511 + Revocation blocks are described in 512 + ``struct jbd2_journal_revoke_header_s``, are at least 16 bytes in 513 + length, but use a full block: 514 + 515 + .. list-table:: 516 + :widths: 1 1 1 77 517 + :header-rows: 1 518 + 519 + * - Offset 520 + - Type 521 + - Name 522 + - Description 523 + * - 0x0 524 + - journal\_header\_t 525 + - r\_header 526 + - Common block header. 527 + * - 0xC 528 + - \_\_be32 529 + - r\_count 530 + - Number of bytes used in this block. 531 + * - 0x10 532 + - \_\_be32 or \_\_be64 533 + - blocks[0] 534 + - Blocks to revoke. 535 + 536 + After r\_count is a linear array of block numbers that are effectively 537 + revoked by this transaction. The size of each block number is 8 bytes if 538 + the superblock advertises 64-bit block number support, or 4 bytes 539 + otherwise. 540 + 541 + If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or 542 + JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation 543 + block is a ``struct jbd2_journal_revoke_tail``, which has this format: 544 + 545 + .. 
list-table:: 546 + :widths: 1 1 1 77 547 + :header-rows: 1 548 + 549 + * - Offset 550 + - Type 551 + - Name 552 + - Description 553 + * - 0x0 554 + - \_\_be32 555 + - r\_checksum 556 + - Checksum of the journal UUID + revocation block 557 + 558 + Commit Block 559 + ~~~~~~~~~~~~ 560 + 561 + The commit block is a sentry that indicates that a transaction has been 562 + completely written to the journal. Once this commit block reaches the 563 + journal, the data stored with this transaction can be written to their 564 + final locations on disk. 565 + 566 + The commit block is described by ``struct commit_header``, which is 32 567 + bytes long (but uses a full block): 568 + 569 + .. list-table:: 570 + :widths: 1 1 1 77 571 + :header-rows: 1 572 + 573 + * - Offset 574 + - Type 575 + - Name 576 + - Descriptor 577 + * - 0x0 578 + - journal\_header\_s 579 + - (open coded) 580 + - Common block header. 581 + * - 0xC 582 + - unsigned char 583 + - h\_chksum\_type 584 + - The type of checksum to use to verify the integrity of the data blocks 585 + in the transaction. See jbd2_checksum_type_ for more info. 586 + * - 0xD 587 + - unsigned char 588 + - h\_chksum\_size 589 + - The number of bytes used by the checksum. Most likely 4. 590 + * - 0xE 591 + - unsigned char 592 + - h\_padding[2] 593 + - 594 + * - 0x10 595 + - \_\_be32 596 + - h\_chksum[JBD2\_CHECKSUM\_BYTES] 597 + - 32 bytes of space to store checksums. If 598 + JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 599 + are set, the first ``__be32`` is the checksum of the journal UUID and 600 + the entire commit block, with this field zeroed. If 601 + JBD2\_FEATURE\_COMPAT\_CHECKSUM is set, the first ``__be32`` is the 602 + crc32 of all the blocks already written to the transaction. 603 + * - 0x30 604 + - \_\_be64 605 + - h\_commit\_sec 606 + - The time that the transaction was committed, in seconds since the epoch. 
607 + * - 0x38 608 + - \_\_be32 609 + - h\_commit\_nsec 610 + - Nanoseconds component of the above timestamp. 611 +

+77

Documentation/filesystems/ext4/ondisk/mmp.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Multiple Mount Protection 4 + ------------------------- 5 + 6 + Multiple mount protection (MMP) is a feature that protects the 7 + filesystem against multiple hosts trying to use the filesystem 8 + simultaneously. When a filesystem is opened (for mounting, or fsck, 9 + etc.), the MMP code running on the node (call it node A) checks a 10 + sequence number. If the sequence number is EXT4\_MMP\_SEQ\_CLEAN, the 11 + open continues. If the sequence number is EXT4\_MMP\_SEQ\_FSCK, then 12 + fsck is (hopefully) running, and open fails immediately. Otherwise, the 13 + open code will wait for twice the specified MMP check interval and check 14 + the sequence number again. If the sequence number has changed, then the 15 + filesystem is active on another machine and the open fails. If the MMP 16 + code passes all of those checks, a new MMP sequence number is generated 17 + and written to the MMP block, and the mount proceeds. 18 + 19 + While the filesystem is live, the kernel sets up a timer to re-check the 20 + MMP block at the specified MMP check interval. To perform the re-check, 21 + the MMP sequence number is re-read; if it does not match the in-memory 22 + MMP sequence number, then another node (node B) has mounted the 23 + filesystem, and node A remounts the filesystem read-only. If the 24 + sequence numbers match, the sequence number is incremented both in 25 + memory and on disk, and the re-check is complete. 26 + 27 + The hostname and device filename are written into the MMP block whenever 28 + an open operation succeeds. The MMP code does not use these values; they 29 + are provided purely for informational purposes. 30 + 31 + The checksum is calculated against the FS UUID and the MMP structure. 32 + The MMP structure (``struct mmp_struct``) is as follows: 33 + 34 + .. 
list-table:: 35 + :widths: 1 1 1 77 36 + :header-rows: 1 37 + 38 + * - Offset 39 + - Type 40 + - Name 41 + - Description 42 + * - 0x0 43 + - \_\_le32 44 + - mmp\_magic 45 + - Magic number for MMP, 0x004D4D50 (“MMP”). 46 + * - 0x4 47 + - \_\_le32 48 + - mmp\_seq 49 + - Sequence number, updated periodically. 50 + * - 0x8 51 + - \_\_le64 52 + - mmp\_time 53 + - Time that the MMP block was last updated. 54 + * - 0x10 55 + - char[64] 56 + - mmp\_nodename 57 + - Hostname of the node that opened the filesystem. 58 + * - 0x50 59 + - char[32] 60 + - mmp\_bdevname 61 + - Block device name of the filesystem. 62 + * - 0x70 63 + - \_\_le16 64 + - mmp\_check\_interval 65 + - The MMP re-check interval, in seconds. 66 + * - 0x72 67 + - \_\_le16 68 + - mmp\_pad1 69 + - Zero. 70 + * - 0x74 71 + - \_\_le32[226] 72 + - mmp\_pad2 73 + - Zero. 74 + * - 0x3FC 75 + - \_\_le32 76 + - mmp\_checksum 77 + - Checksum of the MMP block.

+26

Documentation/filesystems/ext4/ondisk/overview.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + High Level Design 4 + ================= 5 + 6 + An ext4 file system is split into a series of block groups. To reduce 7 + performance difficulties due to fragmentation, the block allocator tries 8 + very hard to keep each file's blocks within the same group, thereby 9 + reducing seek times. The size of a block group is specified in 10 + ``sb.s_blocks_per_group`` blocks, though it can also be calculated as 8 \* 11 + ``block_size_in_bytes``. With the default block size of 4KiB, each group 12 + will contain 32,768 blocks, for a length of 128MiB. The number of block 13 + groups is the size of the device divided by the size of a block group. 14 + 15 + All fields in ext4 are written to disk in little-endian order. HOWEVER, 16 + all fields in jbd2 (the journal) are written to disk in big-endian 17 + order. 18 + 19 + .. include:: blocks.rst 20 + .. include:: blockgroup.rst 21 + .. include:: special_inodes.rst 22 + .. include:: allocators.rst 23 + .. include:: checksums.rst 24 + .. include:: bigalloc.rst 25 + .. include:: inlinedata.rst 26 + .. include:: eainode.rst

+38

Documentation/filesystems/ext4/ondisk/special_inodes.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Special inodes 4 + -------------- 5 + 6 + ext4 reserves some inodes for special features, as follows: 7 + 8 + .. list-table:: 9 + :widths: 1 79 10 + :header-rows: 1 11 + 12 + * - inode Number 13 + - Purpose 14 + * - 0 15 + - Doesn't exist; there is no inode 0. 16 + * - 1 17 + - List of defective blocks. 18 + * - 2 19 + - Root directory. 20 + * - 3 21 + - User quota. 22 + * - 4 23 + - Group quota. 24 + * - 5 25 + - Boot loader. 26 + * - 6 27 + - Undelete directory. 28 + * - 7 29 + - Reserved group descriptors inode. (“resize inode”) 30 + * - 8 31 + - Journal inode. 32 + * - 9 33 + - The “exclude” inode, for snapshots(?) 34 + * - 10 35 + - Replica inode, used for some non-upstream feature? 36 + * - 11 37 + - Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock. 38 +

+801

Documentation/filesystems/ext4/ondisk/super.rst

··· 1 + .. SPDX-License-Identifier: GPL-2.0 2 + 3 + Super Block 4 + ----------- 5 + 6 + The superblock records various information about the enclosing 7 + filesystem, such as block counts, inode counts, supported features, 8 + maintenance information, and more. 9 + 10 + If the sparse\_super feature flag is set, redundant copies of the 11 + superblock and group descriptors are kept only in the groups whose group 12 + number is either 0 or a power of 3, 5, or 7. If the flag is not set, 13 + redundant copies are kept in all groups. 14 + 15 + The superblock checksum is calculated against the superblock structure, 16 + which includes the FS UUID. 17 + 18 + The ext4 superblock is laid out as follows in 19 + ``struct ext4_super_block``: 20 + 21 + .. list-table:: 22 + :widths: 1 1 1 77 23 + :header-rows: 1 24 + 25 + * - Offset 26 + - Size 27 + - Name 28 + - Description 29 + * - 0x0 30 + - \_\_le32 31 + - s\_inodes\_count 32 + - Total inode count. 33 + * - 0x4 34 + - \_\_le32 35 + - s\_blocks\_count\_lo 36 + - Total block count. 37 + * - 0x8 38 + - \_\_le32 39 + - s\_r\_blocks\_count\_lo 40 + - This number of blocks can only be allocated by the super-user. 41 + * - 0xC 42 + - \_\_le32 43 + - s\_free\_blocks\_count\_lo 44 + - Free block count. 45 + * - 0x10 46 + - \_\_le32 47 + - s\_free\_inodes\_count 48 + - Free inode count. 49 + * - 0x14 50 + - \_\_le32 51 + - s\_first\_data\_block 52 + - First data block. This must be at least 1 for 1k-block filesystems and 53 + is typically 0 for all other block sizes. 54 + * - 0x18 55 + - \_\_le32 56 + - s\_log\_block\_size 57 + - Block size is 2 ^ (10 + s\_log\_block\_size). 58 + * - 0x1C 59 + - \_\_le32 60 + - s\_log\_cluster\_size 61 + - Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is 62 + enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size. 63 + * - 0x20 64 + - \_\_le32 65 + - s\_blocks\_per\_group 66 + - Blocks per group. 
67 + * - 0x24 68 + - \_\_le32 69 + - s\_clusters\_per\_group 70 + - Clusters per group, if bigalloc is enabled. Otherwise 71 + s\_clusters\_per\_group must equal s\_blocks\_per\_group. 72 + * - 0x28 73 + - \_\_le32 74 + - s\_inodes\_per\_group 75 + - Inodes per group. 76 + * - 0x2C 77 + - \_\_le32 78 + - s\_mtime 79 + - Mount time, in seconds since the epoch. 80 + * - 0x30 81 + - \_\_le32 82 + - s\_wtime 83 + - Write time, in seconds since the epoch. 84 + * - 0x34 85 + - \_\_le16 86 + - s\_mnt\_count 87 + - Number of mounts since the last fsck. 88 + * - 0x36 89 + - \_\_le16 90 + - s\_max\_mnt\_count 91 + - Number of mounts beyond which a fsck is needed. 92 + * - 0x38 93 + - \_\_le16 94 + - s\_magic 95 + - Magic signature, 0xEF53 96 + * - 0x3A 97 + - \_\_le16 98 + - s\_state 99 + - File system state. See super_state_ for more info. 100 + * - 0x3C 101 + - \_\_le16 102 + - s\_errors 103 + - Behaviour when detecting errors. See super_errors_ for more info. 104 + * - 0x3E 105 + - \_\_le16 106 + - s\_minor\_rev\_level 107 + - Minor revision level. 108 + * - 0x40 109 + - \_\_le32 110 + - s\_lastcheck 111 + - Time of last check, in seconds since the epoch. 112 + * - 0x44 113 + - \_\_le32 114 + - s\_checkinterval 115 + - Maximum time between checks, in seconds. 116 + * - 0x48 117 + - \_\_le32 118 + - s\_creator\_os 119 + - Creator OS. See the table super_creator_ for more info. 120 + * - 0x4C 121 + - \_\_le32 122 + - s\_rev\_level 123 + - Revision level. See the table super_revision_ for more info. 124 + * - 0x50 125 + - \_\_le16 126 + - s\_def\_resuid 127 + - Default uid for reserved blocks. 128 + * - 0x52 129 + - \_\_le16 130 + - s\_def\_resgid 131 + - Default gid for reserved blocks. 132 + * - 133 + - 134 + - 135 + - These fields are for EXT4_DYNAMIC_REV superblocks only. 
136 + 137 + Note: the difference between the compatible feature set and the 138 + incompatible feature set is that if there is a bit set in the 139 + incompatible feature set that the kernel doesn't know about, it should 140 + refuse to mount the filesystem. 141 + 142 + e2fsck's requirements are more strict; if it doesn't know 143 + about a feature in either the compatible or incompatible feature set, it 144 + must abort and not try to meddle with things it doesn't understand... 145 + * - 0x54 146 + - \_\_le32 147 + - s\_first\_ino 148 + - First non-reserved inode. 149 + * - 0x58 150 + - \_\_le16 151 + - s\_inode\_size 152 + - Size of inode structure, in bytes. 153 + * - 0x5A 154 + - \_\_le16 155 + - s\_block\_group\_nr 156 + - Block group # of this superblock. 157 + * - 0x5C 158 + - \_\_le32 159 + - s\_feature\_compat 160 + - Compatible feature set flags. Kernel can still read/write this fs even 161 + if it doesn't understand a flag; fsck should not do that. See the 162 + super_compat_ table for more info. 163 + * - 0x60 164 + - \_\_le32 165 + - s\_feature\_incompat 166 + - Incompatible feature set. If the kernel or fsck doesn't understand one 167 + of these bits, it should stop. See the super_incompat_ table for more 168 + info. 169 + * - 0x64 170 + - \_\_le32 171 + - s\_feature\_ro\_compat 172 + - Readonly-compatible feature set. If the kernel doesn't understand one of 173 + these bits, it can still mount read-only. See the super_rocompat_ table 174 + for more info. 175 + * - 0x68 176 + - \_\_u8 177 + - s\_uuid[16] 178 + - 128-bit UUID for volume. 179 + * - 0x78 180 + - char 181 + - s\_volume\_name[16] 182 + - Volume label. 183 + * - 0x88 184 + - char 185 + - s\_last\_mounted[64] 186 + - Directory where filesystem was last mounted. 187 + * - 0xC8 188 + - \_\_le32 189 + - s\_algorithm\_usage\_bitmap 190 + - For compression (Not used in e2fsprogs/Linux) 191 + * - 192 + - 193 + - 194 + - Performance hints. 
Directory preallocation should only happen if the 195 + EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. 196 + * - 0xCC 197 + - \_\_u8 198 + - s\_prealloc\_blocks 199 + - #. of blocks to try to preallocate for ... files? (Not used in 200 + e2fsprogs/Linux) 201 + * - 0xCD 202 + - \_\_u8 203 + - s\_prealloc\_dir\_blocks 204 + - #. of blocks to preallocate for directories. (Not used in 205 + e2fsprogs/Linux) 206 + * - 0xCE 207 + - \_\_le16 208 + - s\_reserved\_gdt\_blocks 209 + - Number of reserved GDT entries for future filesystem expansion. 210 + * - 211 + - 212 + - 213 + - Journalling support is valid only if EXT4_FEATURE_COMPAT_HAS_JOURNAL is 214 + set. 215 + * - 0xD0 216 + - \_\_u8 217 + - s\_journal\_uuid[16] 218 + - UUID of journal superblock 219 + * - 0xE0 220 + - \_\_le32 221 + - s\_journal\_inum 222 + - inode number of journal file. 223 + * - 0xE4 224 + - \_\_le32 225 + - s\_journal\_dev 226 + - Device number of journal file, if the external journal feature flag is 227 + set. 228 + * - 0xE8 229 + - \_\_le32 230 + - s\_last\_orphan 231 + - Start of list of orphaned inodes to delete. 232 + * - 0xEC 233 + - \_\_le32 234 + - s\_hash\_seed[4] 235 + - HTREE hash seed. 236 + * - 0xFC 237 + - \_\_u8 238 + - s\_def\_hash\_version 239 + - Default hash algorithm to use for directory hashes. See super_def_hash_ 240 + for more info. 241 + * - 0xFD 242 + - \_\_u8 243 + - s\_jnl\_backup\_type 244 + - If this value is 0 or EXT3\_JNL\_BACKUP\_BLOCKS (1), then the 245 + ``s_jnl_blocks`` field contains a duplicate copy of the inode's 246 + ``i_block[]`` array and ``i_size``. 247 + * - 0xFE 248 + - \_\_le16 249 + - s\_desc\_size 250 + - Size of group descriptors, in bytes, if the 64bit incompat feature flag 251 + is set. 252 + * - 0x100 253 + - \_\_le32 254 + - s\_default\_mount\_opts 255 + - Default mount options. See the super_mountopts_ table for more info. 
256 + * - 0x104 257 + - \_\_le32 258 + - s\_first\_meta\_bg 259 + - First metablock block group, if the meta\_bg feature is enabled. 260 + * - 0x108 261 + - \_\_le32 262 + - s\_mkfs\_time 263 + - When the filesystem was created, in seconds since the epoch. 264 + * - 0x10C 265 + - \_\_le32 266 + - s\_jnl\_blocks[17] 267 + - Backup copy of the journal inode's ``i_block[]`` array in the first 15 268 + elements and i\_size\_high and i\_size in the 16th and 17th elements, 269 + respectively. 270 + * - 271 + - 272 + - 273 + - 64bit support is valid only if EXT4_FEATURE_INCOMPAT_64BIT is set. 274 + * - 0x150 275 + - \_\_le32 276 + - s\_blocks\_count\_hi 277 + - High 32-bits of the block count. 278 + * - 0x154 279 + - \_\_le32 280 + - s\_r\_blocks\_count\_hi 281 + - High 32-bits of the reserved block count. 282 + * - 0x158 283 + - \_\_le32 284 + - s\_free\_blocks\_count\_hi 285 + - High 32-bits of the free block count. 286 + * - 0x15C 287 + - \_\_le16 288 + - s\_min\_extra\_isize 289 + - All inodes have at least # bytes. 290 + * - 0x15E 291 + - \_\_le16 292 + - s\_want\_extra\_isize 293 + - New inodes should reserve # bytes. 294 + * - 0x160 295 + - \_\_le32 296 + - s\_flags 297 + - Miscellaneous flags. See the super_flags_ table for more info. 298 + * - 0x164 299 + - \_\_le16 300 + - s\_raid\_stride 301 + - RAID stride. This is the number of logical blocks read from or written 302 + to the disk before moving to the next disk. This affects the placement 303 + of filesystem metadata, which will hopefully make RAID storage faster. 304 + * - 0x166 305 + - \_\_le16 306 + - s\_mmp\_interval 307 + - Number of seconds to wait in multi-mount prevention (MMP) checking. In theory, 308 + MMP is a mechanism to record in the superblock which host and device 309 + have mounted the filesystem, in order to prevent multiple mounts. This 310 + feature does not seem to be implemented... 311 + * - 0x168 312 + - \_\_le64 313 + - s\_mmp\_block 314 + - Block # for multi-mount protection data. 
315 + * - 0x170 316 + - \_\_le32 317 + - s\_raid\_stripe\_width 318 + - RAID stripe width. This is the number of logical blocks read from or 319 + written to the disk before coming back to the current disk. This is used 320 + by the block allocator to try to reduce the number of read-modify-write 321 + operations in a RAID5/6. 322 + * - 0x174 323 + - \_\_u8 324 + - s\_log\_groups\_per\_flex 325 + - Size of a flexible block group is 2 ^ ``s_log_groups_per_flex``. 326 + * - 0x175 327 + - \_\_u8 328 + - s\_checksum\_type 329 + - Metadata checksum algorithm type. The only valid value is 1 (crc32c). 330 + * - 0x176 331 + - \_\_le16 332 + - s\_reserved\_pad 333 + - 334 + * - 0x178 335 + - \_\_le64 336 + - s\_kbytes\_written 337 + - Number of KiB written to this filesystem over its lifetime. 338 + * - 0x180 339 + - \_\_le32 340 + - s\_snapshot\_inum 341 + - inode number of active snapshot. (Not used in e2fsprogs/Linux.) 342 + * - 0x184 343 + - \_\_le32 344 + - s\_snapshot\_id 345 + - Sequential ID of active snapshot. (Not used in e2fsprogs/Linux.) 346 + * - 0x188 347 + - \_\_le64 348 + - s\_snapshot\_r\_blocks\_count 349 + - Number of blocks reserved for active snapshot's future use. (Not used in 350 + e2fsprogs/Linux.) 351 + * - 0x190 352 + - \_\_le32 353 + - s\_snapshot\_list 354 + - inode number of the head of the on-disk snapshot list. (Not used in 355 + e2fsprogs/Linux.) 356 + * - 0x194 357 + - \_\_le32 358 + - s\_error\_count 359 + - Number of errors seen. 360 + * - 0x198 361 + - \_\_le32 362 + - s\_first\_error\_time 363 + - First time an error happened, in seconds since the epoch. 364 + * - 0x19C 365 + - \_\_le32 366 + - s\_first\_error\_ino 367 + - inode involved in first error. 368 + * - 0x1A0 369 + - \_\_le64 370 + - s\_first\_error\_block 371 + - Number of the block involved in the first error. 372 + * - 0x1A8 373 + - \_\_u8 374 + - s\_first\_error\_func[32] 375 + - Name of function where the error happened. 
376 + * - 0x1C8 377 + - \_\_le32 378 + - s\_first\_error\_line 379 + - Line number where error happened. 380 + * - 0x1CC 381 + - \_\_le32 382 + - s\_last\_error\_time 383 + - Time of most recent error, in seconds since the epoch. 384 + * - 0x1D0 385 + - \_\_le32 386 + - s\_last\_error\_ino 387 + - inode involved in most recent error. 388 + * - 0x1D4 389 + - \_\_le32 390 + - s\_last\_error\_line 391 + - Line number where most recent error happened. 392 + * - 0x1D8 393 + - \_\_le64 394 + - s\_last\_error\_block 395 + - Number of block involved in most recent error. 396 + * - 0x1E0 397 + - \_\_u8 398 + - s\_last\_error\_func[32] 399 + - Name of function where the most recent error happened. 400 + * - 0x200 401 + - \_\_u8 402 + - s\_mount\_opts[64] 403 + - ASCIIZ string of mount options. 404 + * - 0x240 405 + - \_\_le32 406 + - s\_usr\_quota\_inum 407 + - Inode number of user `quota <quota>`__ file. 408 + * - 0x244 409 + - \_\_le32 410 + - s\_grp\_quota\_inum 411 + - Inode number of group `quota <quota>`__ file. 412 + * - 0x248 413 + - \_\_le32 414 + - s\_overhead\_blocks 415 + - Overhead blocks/clusters in fs. (Huh? This field is always zero, which 416 + means that the kernel calculates it dynamically.) 417 + * - 0x24C 418 + - \_\_le32 419 + - s\_backup\_bgs[2] 420 + - Block groups containing superblock backups (if sparse\_super2) 421 + * - 0x254 422 + - \_\_u8 423 + - s\_encrypt\_algos[4] 424 + - Encryption algorithms in use. There can be up to four algorithms in use 425 + at any time; valid algorithm codes are given in the super_encrypt_ table 426 + below. 427 + * - 0x258 428 + - \_\_u8 429 + - s\_encrypt\_pw\_salt[16] 430 + - Salt for the string2key algorithm for encryption. 431 + * - 0x268 432 + - \_\_le32 433 + - s\_lpf\_ino 434 + - Inode number of lost+found 435 + * - 0x26C 436 + - \_\_le32 437 + - s\_prj\_quota\_inum 438 + - Inode that tracks project quotas. 
439 + * - 0x270 440 + - \_\_le32 441 + - s\_checksum\_seed 442 + - Checksum seed used for metadata\_csum calculations. This value is 443 + crc32c(~0, $orig\_fs\_uuid). 444 + * - 0x274 445 + - \_\_u8 446 + - s\_wtime_hi 447 + - Upper 8 bits of the s_wtime field. 448 + * - 0x275 449 + - \_\_u8 450 + - s\_mtime_hi 451 + - Upper 8 bits of the s_mtime field. 452 + * - 0x276 453 + - \_\_u8 454 + - s\_mkfs_time_hi 455 + - Upper 8 bits of the s_mkfs_time field. 456 + * - 0x277 457 + - \_\_u8 458 + - s\_lastcheck_hi 459 + - Upper 8 bits of the s_lastcheck field. 460 + * - 0x278 461 + - \_\_u8 462 + - s\_first_error_time_hi 463 + - Upper 8 bits of the s_first_error_time field. 464 + * - 0x279 465 + - \_\_u8 466 + - s\_last_error_time_hi 467 + - Upper 8 bits of the s_last_error_time field. 468 + * - 0x27A 469 + - \_\_u8[2] 470 + - s\_pad 471 + - Zero padding. 472 + * - 0x27C 473 + - \_\_le32 474 + - s\_reserved[96] 475 + - Padding to the end of the block. 476 + * - 0x3FC 477 + - \_\_le32 478 + - s\_checksum 479 + - Superblock checksum. 480 + 481 + .. _super_state: 482 + 483 + The superblock state is some combination of the following: 484 + 485 + .. list-table:: 486 + :widths: 1 79 487 + :header-rows: 1 488 + 489 + * - Value 490 + - Description 491 + * - 0x0001 492 + - Cleanly umounted 493 + * - 0x0002 494 + - Errors detected 495 + * - 0x0004 496 + - Orphans being recovered 497 + 498 + .. _super_errors: 499 + 500 + The superblock error policy is one of the following: 501 + 502 + .. list-table:: 503 + :widths: 1 79 504 + :header-rows: 1 505 + 506 + * - Value 507 + - Description 508 + * - 1 509 + - Continue 510 + * - 2 511 + - Remount read-only 512 + * - 3 513 + - Panic 514 + 515 + .. _super_creator: 516 + 517 + The filesystem creator is one of the following: 518 + 519 + .. 
list-table:: 520 + :widths: 1 79 521 + :header-rows: 1 522 + 523 + * - Value 524 + - Description 525 + * - 0 526 + - Linux 527 + * - 1 528 + - Hurd 529 + * - 2 530 + - Masix 531 + * - 3 532 + - FreeBSD 533 + * - 4 534 + - Lites 535 + 536 + .. _super_revision: 537 + 538 + The superblock revision is one of the following: 539 + 540 + .. list-table:: 541 + :widths: 1 79 542 + :header-rows: 1 543 + 544 + * - Value 545 + - Description 546 + * - 0 547 + - Original format 548 + * - 1 549 + - v2 format w/ dynamic inode sizes 550 + 551 + Note that ``EXT4_DYNAMIC_REV`` refers to a revision 1 or newer filesystem. 552 + 553 + .. _super_compat: 554 + 555 + The superblock compatible features field is a combination of any of the 556 + following: 557 + 558 + .. list-table:: 559 + :widths: 1 79 560 + :header-rows: 1 561 + 562 + * - Value 563 + - Description 564 + * - 0x1 565 + - Directory preallocation (COMPAT\_DIR\_PREALLOC). 566 + * - 0x2 567 + - “imagic inodes”. Not clear from the code what this does 568 + (COMPAT\_IMAGIC\_INODES). 569 + * - 0x4 570 + - Has a journal (COMPAT\_HAS\_JOURNAL). 571 + * - 0x8 572 + - Supports extended attributes (COMPAT\_EXT\_ATTR). 573 + * - 0x10 574 + - Has reserved GDT blocks for filesystem expansion 575 + (COMPAT\_RESIZE\_INODE). Requires RO\_COMPAT\_SPARSE\_SUPER. 576 + * - 0x20 577 + - Has directory indices (COMPAT\_DIR\_INDEX). 578 + * - 0x40 579 + - “Lazy BG”. Not in Linux kernel, seems to have been for uninitialized 580 + block groups? (COMPAT\_LAZY\_BG) 581 + * - 0x80 582 + - “Exclude inode”. Not used. (COMPAT\_EXCLUDE\_INODE). 583 + * - 0x100 584 + - “Exclude bitmap”. Seems to be used to indicate the presence of 585 + snapshot-related exclude bitmaps? Not defined in kernel or used in 586 + e2fsprogs (COMPAT\_EXCLUDE\_BITMAP). 587 + * - 0x200 588 + - Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs 589 + points to the two block groups that contain backup superblocks 590 + (COMPAT\_SPARSE\_SUPER2). 591 + 592 + .. 
_super_incompat: 593 + 594 + The superblock incompatible features field is a combination of any of the 595 + following: 596 + 597 + .. list-table:: 598 + :widths: 1 79 599 + :header-rows: 1 600 + 601 + * - Value 602 + - Description 603 + * - 0x1 604 + - Compression (INCOMPAT\_COMPRESSION). 605 + * - 0x2 606 + - Directory entries record the file type. See ext4\_dir\_entry\_2 below 607 + (INCOMPAT\_FILETYPE). 608 + * - 0x4 609 + - Filesystem needs recovery (INCOMPAT\_RECOVER). 610 + * - 0x8 611 + - Filesystem has a separate journal device (INCOMPAT\_JOURNAL\_DEV). 612 + * - 0x10 613 + - Meta block groups. See the earlier discussion of this feature 614 + (INCOMPAT\_META\_BG). 615 + * - 0x40 616 + - Files in this filesystem use extents (INCOMPAT\_EXTENTS). 617 + * - 0x80 618 + - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT). 619 + * - 0x100 620 + - Multiple mount protection. Not implemented (INCOMPAT\_MMP). 621 + * - 0x200 622 + - Flexible block groups. See the earlier discussion of this feature 623 + (INCOMPAT\_FLEX\_BG). 624 + * - 0x400 625 + - Inodes can be used to store large extended attribute values 626 + (INCOMPAT\_EA\_INODE). 627 + * - 0x1000 628 + - Data in directory entry (INCOMPAT\_DIRDATA). (Not implemented?) 629 + * - 0x2000 630 + - Metadata checksum seed is stored in the superblock. This feature enables 631 + the administrator to change the UUID of a metadata\_csum filesystem 632 + while the filesystem is mounted; without it, the checksum definition 633 + requires all metadata blocks to be rewritten (INCOMPAT\_CSUM\_SEED). 634 + * - 0x4000 635 + - Large directory >2GB or 3-level htree (INCOMPAT\_LARGEDIR). Prior to 636 + this feature, directories could not be larger than 4GiB and could not 637 + have an htree more than 2 levels deep. If this feature is enabled, 638 + directories can be larger than 4GiB and have a maximum htree depth of 3. 639 + * - 0x8000 640 + - Data in inode (INCOMPAT\_INLINE\_DATA). 
641 + * - 0x10000 642 + - Encrypted inodes are present on the filesystem. (INCOMPAT\_ENCRYPT). 643 + 644 + .. _super_rocompat: 645 + 646 + The superblock read-only compatible features field is a combination of any of 647 + the following: 648 + 649 + .. list-table:: 650 + :widths: 1 79 651 + :header-rows: 1 652 + 653 + * - Value 654 + - Description 655 + * - 0x1 656 + - Sparse superblocks. See the earlier discussion of this feature 657 + (RO\_COMPAT\_SPARSE\_SUPER). 658 + * - 0x2 659 + - This filesystem has been used to store a file greater than 2GiB 660 + (RO\_COMPAT\_LARGE\_FILE). 661 + * - 0x4 662 + - Not used in kernel or e2fsprogs (RO\_COMPAT\_BTREE\_DIR). 663 + * - 0x8 664 + - This filesystem has files whose sizes are represented in units of 665 + logical blocks, not 512-byte sectors. This implies a very large file 666 + indeed! (RO\_COMPAT\_HUGE\_FILE) 667 + * - 0x10 668 + - Group descriptors have checksums. In addition to detecting corruption, 669 + this is useful for lazy formatting with uninitialized groups 670 + (RO\_COMPAT\_GDT\_CSUM). 671 + * - 0x20 672 + - Indicates that the old ext3 32,000 subdirectory limit no longer applies 673 + (RO\_COMPAT\_DIR\_NLINK). A directory's i\_links\_count will be set to 1 674 + if it is incremented past 64,999. 675 + * - 0x40 676 + - Indicates that large inodes exist on this filesystem 677 + (RO\_COMPAT\_EXTRA\_ISIZE). 678 + * - 0x80 679 + - This filesystem has a snapshot (RO\_COMPAT\_HAS\_SNAPSHOT). 680 + * - 0x100 681 + - `Quota <Quota>`__ (RO\_COMPAT\_QUOTA). 682 + * - 0x200 683 + - This filesystem supports “bigalloc”, which means that file extents are 684 + tracked in units of clusters (of blocks) instead of blocks 685 + (RO\_COMPAT\_BIGALLOC). 686 + * - 0x400 687 + - This filesystem supports metadata checksumming. 688 + (RO\_COMPAT\_METADATA\_CSUM; implies RO\_COMPAT\_GDT\_CSUM, though 689 + GDT\_CSUM must not be set) 690 + * - 0x800 691 + - Filesystem supports replicas. 
This feature is neither in the kernel nor 692 + e2fsprogs. (RO\_COMPAT\_REPLICA) 693 + * - 0x1000 694 + - Read-only filesystem image; the kernel will not mount this image 695 + read-write and most tools will refuse to write to the image. 696 + (RO\_COMPAT\_READONLY) 697 + * - 0x2000 698 + - Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT) 699 + 700 + .. _super_def_hash: 701 + 702 + The ``s_def_hash_version`` field is one of the following: 703 + 704 + .. list-table:: 705 + :widths: 1 79 706 + :header-rows: 1 707 + 708 + * - Value 709 + - Description 710 + * - 0x0 711 + - Legacy. 712 + * - 0x1 713 + - Half MD4. 714 + * - 0x2 715 + - Tea. 716 + * - 0x3 717 + - Legacy, unsigned. 718 + * - 0x4 719 + - Half MD4, unsigned. 720 + * - 0x5 721 + - Tea, unsigned. 722 + 723 + .. _super_mountopts: 724 + 725 + The ``s_default_mount_opts`` field is any combination of the following: 726 + 727 + .. list-table:: 728 + :widths: 1 79 729 + :header-rows: 1 730 + 731 + * - Value 732 + - Description 733 + * - 0x0001 734 + - Print debugging info upon (re)mount. (EXT4\_DEFM\_DEBUG) 735 + * - 0x0002 736 + - New files take the gid of the containing directory (instead of the fsgid 737 + of the current process). (EXT4\_DEFM\_BSDGROUPS) 738 + * - 0x0004 739 + - Support userspace-provided extended attributes. (EXT4\_DEFM\_XATTR\_USER) 740 + * - 0x0008 741 + - Support POSIX access control lists (ACLs). (EXT4\_DEFM\_ACL) 742 + * - 0x0010 743 + - Do not support 32-bit UIDs. (EXT4\_DEFM\_UID16) 744 + * - 0x0020 745 + - All data and metadata are committed to the journal. 746 + (EXT4\_DEFM\_JMODE\_DATA) 747 + * - 0x0040 748 + - All data are flushed to the disk before metadata are committed to the 749 + journal. (EXT4\_DEFM\_JMODE\_ORDERED) 750 + * - 0x0060 751 + - Data ordering is not preserved; data may be written after the metadata 752 + has been written. (EXT4\_DEFM\_JMODE\_WBACK) 753 + * - 0x0100 754 + - Disable write flushes. 
(EXT4\_DEFM\_NOBARRIER) 755 + * - 0x0200 756 + - Track which blocks in a filesystem are metadata and therefore should not 757 + be used as data blocks. This option will be enabled by default on 3.18, 758 + hopefully. (EXT4\_DEFM\_BLOCK\_VALIDITY) 759 + * - 0x0400 760 + - Enable DISCARD support, where the storage device is told about blocks 761 + becoming unused. (EXT4\_DEFM\_DISCARD) 762 + * - 0x0800 763 + - Disable delayed allocation. (EXT4\_DEFM\_NODELALLOC) 764 + 765 + .. _super_flags: 766 + 767 + The ``s_flags`` field is any combination of the following: 768 + 769 + .. list-table:: 770 + :widths: 1 79 771 + :header-rows: 1 772 + 773 + * - Value 774 + - Description 775 + * - 0x0001 776 + - Signed directory hash in use. 777 + * - 0x0002 778 + - Unsigned directory hash in use. 779 + * - 0x0004 780 + - To test development code. 781 + 782 + .. _super_encrypt: 783 + 784 + The ``s_encrypt_algos`` list can contain any of the following: 785 + 786 + .. list-table:: 787 + :widths: 1 79 788 + :header-rows: 1 789 + 790 + * - Value 791 + - Description 792 + * - 0 793 + - Invalid algorithm (ENCRYPTION\_MODE\_INVALID). 794 + * - 1 795 + - 256-bit AES in XTS mode (ENCRYPTION\_MODE\_AES\_256\_XTS). 796 + * - 2 797 + - 256-bit AES in GCM mode (ENCRYPTION\_MODE\_AES\_256\_GCM). 798 + * - 3 799 + - 256-bit AES in CBC mode (ENCRYPTION\_MODE\_AES\_256\_CBC). 800 + 801 + Total size of the superblock is 1024 bytes.

+11

Documentation/index.rst

··· 102 102 103 103 sh/index 104 104 105 + Filesystem Documentation 106 + ------------------------ 107 + 108 + The documentation in this section is provided by specific filesystem 109 + subprojects. 110 + 111 + .. toctree:: 112 + :maxdepth: 2 113 + 114 + filesystems/ext4/index 115 + 105 116 Korean translations 106 117 ------------------- 107 118

+9 -1

fs/dax.c

··· 566 566 if (index >= end) 567 567 break; 568 568 569 - if (!radix_tree_exceptional_entry(pvec_ent)) 569 + if (WARN_ON_ONCE( 570 + !radix_tree_exceptional_entry(pvec_ent))) 570 571 continue; 571 572 572 573 xa_lock_irq(&mapping->i_pages); ··· 579 578 if (page) 580 579 break; 581 580 } 581 + 582 + /* 583 + * We don't expect normal struct page entries to exist in our 584 + * tree, but we keep these pagevec calls so that this code is 585 + * consistent with the common pattern for handling pagevecs 586 + * throughout the kernel. 587 + */ 582 588 pagevec_remove_exceptionals(&pvec); 583 589 pagevec_release(&pvec); 584 590 index++;

+3 -3

fs/ext4/balloc.c

··· 426 426 } 427 427 bh = sb_getblk(sb, bitmap_blk); 428 428 if (unlikely(!bh)) { 429 - ext4_error(sb, "Cannot get buffer for block bitmap - " 430 - "block_group = %u, block_bitmap = %llu", 431 - block_group, bitmap_blk); 429 + ext4_warning(sb, "Cannot get buffer for block bitmap - " 430 + "block_group = %u, block_bitmap = %llu", 431 + block_group, bitmap_blk); 432 432 return ERR_PTR(-ENOMEM); 433 433 } 434 434

+18 -14

fs/ext4/ext4.h

··· 789 789 * affected filesystem before 2242. 790 790 */ 791 791 792 - static inline __le32 ext4_encode_extra_time(struct timespec *time) 792 + static inline __le32 ext4_encode_extra_time(struct timespec64 *time) 793 793 { 794 - u32 extra = sizeof(time->tv_sec) > 4 ? 795 - ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0; 794 + u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK; 796 795 return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); 797 796 } 798 797 799 - static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) 798 + static inline void ext4_decode_extra_time(struct timespec64 *time, 799 + __le32 extra) 800 800 { 801 - if (unlikely(sizeof(time->tv_sec) > 4 && 802 - (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) { 801 + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) { 803 802 804 803 #if 1 805 804 /* Handle legacy encoding of pre-1970 dates with epoch ··· 820 821 do { \ 821 822 (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ 822 823 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ 823 - struct timespec ts = timespec64_to_timespec((inode)->xtime); \ 824 824 (raw_inode)->xtime ## _extra = \ 825 - ext4_encode_extra_time(&ts); \ 825 + ext4_encode_extra_time(&(inode)->xtime); \ 826 826 } \ 827 827 } while (0) 828 828 ··· 838 840 do { \ 839 841 (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ 840 842 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ 841 - struct timespec ts = timespec64_to_timespec((inode)->xtime); \ 842 - ext4_decode_extra_time(&ts, \ 843 + ext4_decode_extra_time(&(inode)->xtime, \ 843 844 raw_inode->xtime ## _extra); \ 844 - (inode)->xtime = timespec_to_timespec64(ts); \ 845 845 } \ 846 846 else \ 847 847 (inode)->xtime.tv_nsec = 0; \ ··· 989 993 990 994 /* 991 995 * File creation time. Its function is same as that of 992 - * struct timespec i_{a,c,m}time in the generic inode. 
996 + * struct timespec64 i_{a,c,m}time in the generic inode. 993 997 */ 994 - struct timespec i_crtime; 998 + struct timespec64 i_crtime; 995 999 996 1000 /* mballoc */ 997 1001 struct list_head i_prealloc_list; ··· 1295 1299 __le32 s_lpf_ino; /* Location of the lost+found inode */ 1296 1300 __le32 s_prj_quota_inum; /* inode for tracking project quota */ 1297 1301 __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ 1298 - __le32 s_reserved[98]; /* Padding to the end of the block */ 1302 + __u8 s_wtime_hi; 1303 + __u8 s_mtime_hi; 1304 + __u8 s_mkfs_time_hi; 1305 + __u8 s_lastcheck_hi; 1306 + __u8 s_first_error_time_hi; 1307 + __u8 s_last_error_time_hi; 1308 + __u8 s_pad[2]; 1309 + __le32 s_reserved[96]; /* Padding to the end of the block */ 1299 1310 __le32 s_checksum; /* crc32c(superblock) */ 1300 1311 }; 1301 1312 ··· 2459 2456 extern int ext4_inode_attach_jinode(struct inode *inode); 2460 2457 extern int ext4_can_truncate(struct inode *inode); 2461 2458 extern int ext4_truncate(struct inode *); 2459 + extern int ext4_break_layouts(struct inode *); 2462 2460 extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); 2463 2461 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); 2464 2462 extern void ext4_set_inode_flags(struct inode *);

+17

fs/ext4/extents.c

··· 4826 4826 * released from page cache. 4827 4827 */ 4828 4828 down_write(&EXT4_I(inode)->i_mmap_sem); 4829 + 4830 + ret = ext4_break_layouts(inode); 4831 + if (ret) { 4832 + up_write(&EXT4_I(inode)->i_mmap_sem); 4833 + goto out_mutex; 4834 + } 4835 + 4829 4836 ret = ext4_update_disksize_before_punch(inode, offset, len); 4830 4837 if (ret) { 4831 4838 up_write(&EXT4_I(inode)->i_mmap_sem); ··· 5506 5499 * page cache. 5507 5500 */ 5508 5501 down_write(&EXT4_I(inode)->i_mmap_sem); 5502 + 5503 + ret = ext4_break_layouts(inode); 5504 + if (ret) 5505 + goto out_mmap; 5506 + 5509 5507 /* 5510 5508 * Need to round down offset to be aligned with page size boundary 5511 5509 * for page size > block size. ··· 5659 5647 * page cache. 5660 5648 */ 5661 5649 down_write(&EXT4_I(inode)->i_mmap_sem); 5650 + 5651 + ret = ext4_break_layouts(inode); 5652 + if (ret) 5653 + goto out_mmap; 5654 + 5662 5655 /* 5663 5656 * Need to round down to align start offset to page size boundary 5664 5657 * for page size > block size.

+4 -4

fs/ext4/ialloc.c

··· 138 138 } 139 139 bh = sb_getblk(sb, bitmap_blk); 140 140 if (unlikely(!bh)) { 141 - ext4_error(sb, "Cannot read inode bitmap - " 142 - "block_group = %u, inode_bitmap = %llu", 143 - block_group, bitmap_blk); 141 + ext4_warning(sb, "Cannot read inode bitmap - " 142 + "block_group = %u, inode_bitmap = %llu", 143 + block_group, bitmap_blk); 144 144 return ERR_PTR(-ENOMEM); 145 145 } 146 146 if (bitmap_uptodate(bh)) ··· 1086 1086 /* This is the optimal IO size (for stat), not the fs block size */ 1087 1087 inode->i_blocks = 0; 1088 1088 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); 1089 - ei->i_crtime = timespec64_to_timespec(inode->i_mtime); 1089 + ei->i_crtime = inode->i_mtime; 1090 1090 1091 1091 memset(ei->i_data, 0, sizeof(ei->i_data)); 1092 1092 ei->i_dir_start_lookup = 0;

+54 -11

fs/ext4/inode.c

··· 317 317 * (Well, we could do this if we need to, but heck - it works) 318 318 */ 319 319 ext4_orphan_del(handle, inode); 320 - EXT4_I(inode)->i_dtime = get_seconds(); 320 + EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); 321 321 322 322 /* 323 323 * One subtle ordering requirement: if anything has gone wrong ··· 4191 4191 return 0; 4192 4192 } 4193 4193 4194 + static void ext4_wait_dax_page(struct ext4_inode_info *ei, bool *did_unlock) 4195 + { 4196 + *did_unlock = true; 4197 + up_write(&ei->i_mmap_sem); 4198 + schedule(); 4199 + down_write(&ei->i_mmap_sem); 4200 + } 4201 + 4202 + int ext4_break_layouts(struct inode *inode) 4203 + { 4204 + struct ext4_inode_info *ei = EXT4_I(inode); 4205 + struct page *page; 4206 + bool retry; 4207 + int error; 4208 + 4209 + if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem))) 4210 + return -EINVAL; 4211 + 4212 + do { 4213 + retry = false; 4214 + page = dax_layout_busy_page(inode->i_mapping); 4215 + if (!page) 4216 + return 0; 4217 + 4218 + error = ___wait_var_event(&page->_refcount, 4219 + atomic_read(&page->_refcount) == 1, 4220 + TASK_INTERRUPTIBLE, 0, 0, 4221 + ext4_wait_dax_page(ei, &retry)); 4222 + } while (error == 0 && retry); 4223 + 4224 + return error; 4225 + } 4226 + 4194 4227 /* 4195 4228 * ext4_punch_hole: punches a hole in a file by releasing the blocks 4196 4229 * associated with the given offset and length ··· 4297 4264 * page cache. 
4298 4265 */ 4299 4266 down_write(&EXT4_I(inode)->i_mmap_sem); 4267 + 4268 + ret = ext4_break_layouts(inode); 4269 + if (ret) 4270 + goto out_dio; 4271 + 4300 4272 first_block_offset = round_up(offset, sb->s_blocksize); 4301 4273 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; 4302 4274 ··· 4982 4944 ret = -EFSCORRUPTED; 4983 4945 goto bad_inode; 4984 4946 } else if (!ext4_has_inline_data(inode)) { 4985 - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4986 - if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4987 - (S_ISLNK(inode->i_mode) && 4988 - !ext4_inode_is_fast_symlink(inode)))) 4989 - /* Validate extent which is part of inode */ 4947 + /* validate the block references in the inode */ 4948 + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4949 + (S_ISLNK(inode->i_mode) && 4950 + !ext4_inode_is_fast_symlink(inode))) { 4951 + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4990 4952 ret = ext4_ext_check_inode(inode); 4991 - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4992 - (S_ISLNK(inode->i_mode) && 4993 - !ext4_inode_is_fast_symlink(inode))) { 4994 - /* Validate block references which are part of inode */ 4995 - ret = ext4_ind_check_inode(inode); 4953 + else 4954 + ret = ext4_ind_check_inode(inode); 4996 4955 } 4997 4956 } 4998 4957 if (ret) ··· 5588 5553 ext4_wait_for_tail_page_commit(inode); 5589 5554 } 5590 5555 down_write(&EXT4_I(inode)->i_mmap_sem); 5556 + 5557 + rc = ext4_break_layouts(inode); 5558 + if (rc) { 5559 + up_write(&EXT4_I(inode)->i_mmap_sem); 5560 + error = rc; 5561 + goto err_out; 5562 + } 5563 + 5591 5564 /* 5592 5565 * Truncate pagecache after we've waited for commit 5593 5566 * in data=journal mode to make pages freeable.

+4 -3

fs/ext4/mballoc.c

··· 14 14 #include <linux/log2.h> 15 15 #include <linux/module.h> 16 16 #include <linux/slab.h> 17 + #include <linux/nospec.h> 17 18 #include <linux/backing-dev.h> 18 19 #include <trace/events/ext4.h> 19 20 ··· 2141 2140 * This should tell if fe_len is exactly power of 2 2142 2141 */ 2143 2142 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 2144 - ac->ac_2order = i - 1; 2143 + ac->ac_2order = array_index_nospec(i - 1, 2144 + sb->s_blocksize_bits + 2); 2145 2145 } 2146 2146 2147 2147 /* if stream allocation is enabled, use global goal */ ··· 3801 3799 ext4_group_t group; 3802 3800 ext4_grpblk_t bit; 3803 3801 unsigned long long grp_blk_start; 3804 - int err = 0; 3805 3802 int free = 0; 3806 3803 3807 3804 BUG_ON(pa->pa_deleted == 0); ··· 3841 3840 } 3842 3841 atomic_add(free, &sbi->s_mb_discarded); 3843 3842 3844 - return err; 3843 + return 0; 3845 3844 } 3846 3845 3847 3846 static noinline_for_stack int

+3 -3

fs/ext4/mmp.c

··· 147 147 148 148 mmp_block = le64_to_cpu(es->s_mmp_block); 149 149 mmp = (struct mmp_struct *)(bh->b_data); 150 - mmp->mmp_time = cpu_to_le64(get_seconds()); 150 + mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); 151 151 /* 152 152 * Start with the higher mmp_check_interval and reduce it if 153 153 * the MMP block is being updated on time. ··· 165 165 seq = 1; 166 166 167 167 mmp->mmp_seq = cpu_to_le32(seq); 168 - mmp->mmp_time = cpu_to_le64(get_seconds()); 168 + mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); 169 169 last_update_time = jiffies; 170 170 171 171 retval = write_mmp_block(sb, bh); ··· 241 241 * Unmount seems to be clean. 242 242 */ 243 243 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); 244 - mmp->mmp_time = cpu_to_le64(get_seconds()); 244 + mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); 245 245 246 246 retval = write_mmp_block(sb, bh); 247 247

+1 -3

fs/ext4/move_extent.c

··· 134 134 mapping[0] = inode1->i_mapping; 135 135 mapping[1] = inode2->i_mapping; 136 136 } else { 137 - pgoff_t tmp = index1; 138 - index1 = index2; 139 - index2 = tmp; 137 + swap(index1, index2); 140 138 mapping[0] = inode2->i_mapping; 141 139 mapping[1] = inode1->i_mapping; 142 140 }

+1

fs/ext4/namei.c

··· 1398 1398 goto cleanup_and_exit; 1399 1399 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " 1400 1400 "falling back\n")); 1401 + ret = NULL; 1401 1402 } 1402 1403 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); 1403 1404 if (!nblocks) {

+47 -23

fs/ext4/super.c

··· 312 312 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 313 313 } 314 314 315 + static void __ext4_update_tstamp(__le32 *lo, __u8 *hi) 316 + { 317 + time64_t now = ktime_get_real_seconds(); 318 + 319 + now = clamp_val(now, 0, (1ull << 40) - 1); 320 + 321 + *lo = cpu_to_le32(lower_32_bits(now)); 322 + *hi = upper_32_bits(now); 323 + } 324 + 325 + static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) 326 + { 327 + return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); 328 + } 329 + #define ext4_update_tstamp(es, tstamp) \ 330 + __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) 331 + #define ext4_get_tstamp(es, tstamp) \ 332 + __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) 315 333 316 334 static void __save_error_info(struct super_block *sb, const char *func, 317 335 unsigned int line) ··· 340 322 if (bdev_read_only(sb->s_bdev)) 341 323 return; 342 324 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 343 - es->s_last_error_time = cpu_to_le32(get_seconds()); 325 + ext4_update_tstamp(es, s_last_error_time); 344 326 strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); 345 327 es->s_last_error_line = cpu_to_le32(line); 346 328 if (!es->s_first_error_time) { 347 329 es->s_first_error_time = es->s_last_error_time; 330 + es->s_first_error_time_hi = es->s_last_error_time_hi; 348 331 strncpy(es->s_first_error_func, func, 349 332 sizeof(es->s_first_error_func)); 350 333 es->s_first_error_line = cpu_to_le32(line); ··· 795 776 struct ext4_sb_info *sbi = EXT4_SB(sb); 796 777 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 797 778 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 779 + int ret; 798 780 799 - if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) && 800 - !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) { 801 - percpu_counter_sub(&sbi->s_freeclusters_counter, 802 - grp->bb_free); 803 - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, 804 - &grp->bb_state); 781 + if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { 782 + ret = 
ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, 783 + &grp->bb_state); 784 + if (!ret) 785 + percpu_counter_sub(&sbi->s_freeclusters_counter, 786 + grp->bb_free); 805 787 } 806 788 807 - if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) && 808 - !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { 809 - if (gdp) { 789 + if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { 790 + ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, 791 + &grp->bb_state); 792 + if (!ret && gdp) { 810 793 int count; 811 794 812 795 count = ext4_free_inodes_count(sb, gdp); 813 796 percpu_counter_sub(&sbi->s_freeinodes_counter, 814 797 count); 815 798 } 816 - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, 817 - &grp->bb_state); 818 799 } 819 800 } 820 801 ··· 2193 2174 "warning: maximal mount count reached, " 2194 2175 "running e2fsck is recommended"); 2195 2176 else if (le32_to_cpu(es->s_checkinterval) && 2196 - (le32_to_cpu(es->s_lastcheck) + 2197 - le32_to_cpu(es->s_checkinterval) <= get_seconds())) 2177 + (ext4_get_tstamp(es, s_lastcheck) + 2178 + le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) 2198 2179 ext4_msg(sb, KERN_WARNING, 2199 2180 "warning: checktime reached, " 2200 2181 "running e2fsck is recommended"); ··· 2203 2184 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 2204 2185 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); 2205 2186 le16_add_cpu(&es->s_mnt_count, 1); 2206 - es->s_mtime = cpu_to_le32(get_seconds()); 2187 + ext4_update_tstamp(es, s_mtime); 2207 2188 ext4_update_dynamic_rev(sb); 2208 2189 if (sbi->s_journal) 2209 2190 ext4_set_feature_journal_needs_recovery(sb); ··· 2894 2875 ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", 2895 2876 le32_to_cpu(es->s_error_count)); 2896 2877 if (es->s_first_error_time) { 2897 - printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", 2898 - sb->s_id, le32_to_cpu(es->s_first_error_time), 2878 + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", 2879 + sb->s_id, 
2880 + ext4_get_tstamp(es, s_first_error_time), 2899 2881 (int) sizeof(es->s_first_error_func), 2900 2882 es->s_first_error_func, 2901 2883 le32_to_cpu(es->s_first_error_line)); ··· 2909 2889 printk(KERN_CONT "\n"); 2910 2890 } 2911 2891 if (es->s_last_error_time) { 2912 - printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", 2913 - sb->s_id, le32_to_cpu(es->s_last_error_time), 2892 + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", 2893 + sb->s_id, 2894 + ext4_get_tstamp(es, s_last_error_time), 2914 2895 (int) sizeof(es->s_last_error_func), 2915 2896 es->s_last_error_func, 2916 2897 le32_to_cpu(es->s_last_error_line)); ··· 4834 4813 * to complain and force a full file system check. 4835 4814 */ 4836 4815 if (!(sb->s_flags & SB_RDONLY)) 4837 - es->s_wtime = cpu_to_le32(get_seconds()); 4816 + ext4_update_tstamp(es, s_wtime); 4838 4817 if (sb->s_bdev->bd_part) 4839 4818 es->s_kbytes_written = 4840 4819 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + ··· 5100 5079 int i, j; 5101 5080 #endif 5102 5081 char *orig_data = kstrdup(data, GFP_KERNEL); 5082 + 5083 + if (data && !orig_data) 5084 + return -ENOMEM; 5103 5085 5104 5086 /* Store the original options */ 5105 5087 old_sb_flags = sb->s_flags; ··· 5689 5665 DQUOT_USAGE_ENABLED | 5690 5666 (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); 5691 5667 if (err) { 5692 - for (type--; type >= 0; type--) 5693 - dquot_quota_off(sb, type); 5694 - 5695 5668 ext4_warning(sb, 5696 5669 "Failed to enable quota tracking " 5697 5670 "(type=%d, err=%d). Please run " 5698 5671 "e2fsck to fix.", type, err); 5672 + for (type--; type >= 0; type--) 5673 + dquot_quota_off(sb, type); 5674 + 5699 5675 return err; 5700 5676 } 5701 5677 }

+27 -5

fs/ext4/sysfs.c

··· 25 25 attr_reserved_clusters, 26 26 attr_inode_readahead, 27 27 attr_trigger_test_error, 28 + attr_first_error_time, 29 + attr_last_error_time, 28 30 attr_feature, 29 31 attr_pointer_ui, 30 32 attr_pointer_atomic, ··· 184 182 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); 185 183 EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); 186 184 EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); 187 - EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); 188 - EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); 185 + EXT4_ATTR(first_error_time, 0444, first_error_time); 186 + EXT4_ATTR(last_error_time, 0444, last_error_time); 189 187 190 188 static unsigned int old_bump_val = 128; 191 189 EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); ··· 251 249 return NULL; 252 250 } 253 251 252 + static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) 253 + { 254 + return snprintf(buf, PAGE_SIZE, "%lld", 255 + ((time64_t)hi << 32) + le32_to_cpu(lo)); 256 + } 257 + 258 + #define print_tstamp(buf, es, tstamp) \ 259 + __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) 260 + 254 261 static ssize_t ext4_attr_show(struct kobject *kobj, 255 262 struct attribute *attr, char *buf) 256 263 { ··· 285 274 case attr_pointer_ui: 286 275 if (!ptr) 287 276 return 0; 288 - return snprintf(buf, PAGE_SIZE, "%u\n", 289 - *((unsigned int *) ptr)); 277 + if (a->attr_ptr == ptr_ext4_super_block_offset) 278 + return snprintf(buf, PAGE_SIZE, "%u\n", 279 + le32_to_cpup(ptr)); 280 + else 281 + return snprintf(buf, PAGE_SIZE, "%u\n", 282 + *((unsigned int *) ptr)); 290 283 case attr_pointer_atomic: 291 284 if (!ptr) 292 285 return 0; ··· 298 283 atomic_read((atomic_t *) ptr)); 299 284 case attr_feature: 300 285 return snprintf(buf, PAGE_SIZE, "supported\n"); 286 + case attr_first_error_time: 287 + return print_tstamp(buf, sbi->s_es, s_first_error_time); 288 + case attr_last_error_time: 289 + return print_tstamp(buf, 
sbi->s_es, s_last_error_time); 301 290 } 302 291 303 292 return 0; ··· 327 308 ret = kstrtoul(skip_spaces(buf), 0, &t); 328 309 if (ret) 329 310 return ret; 330 - *((unsigned int *) ptr) = t; 311 + if (a->attr_ptr == ptr_ext4_super_block_offset) 312 + *((__le32 *) ptr) = cpu_to_le32(t); 313 + else 314 + *((unsigned int *) ptr) = t; 331 315 return len; 332 316 case attr_inode_readahead: 333 317 return inode_readahead_blks_store(sbi, buf, len);

+4

fs/ext4/truncate.h

··· 11 11 */ 12 12 static inline void ext4_truncate_failed_write(struct inode *inode) 13 13 { 14 + /* 15 + * We don't need to call ext4_break_layouts() because the blocks we 16 + * are truncating were never visible to userspace. 17 + */ 14 18 down_write(&EXT4_I(inode)->i_mmap_sem); 15 19 truncate_inode_pages(inode->i_mapping, inode->i_size); 16 20 ext4_truncate(inode);

+2

fs/ext4/xattr.c

··· 190 190 struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); 191 191 if ((void *)next >= end) 192 192 return -EFSCORRUPTED; 193 + if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) 194 + return -EFSCORRUPTED; 193 195 e = next; 194 196 } 195 197

+2 -1

fs/jbd2/commit.c

··· 121 121 struct commit_header *tmp; 122 122 struct buffer_head *bh; 123 123 int ret; 124 - struct timespec64 now = current_kernel_time64(); 124 + struct timespec64 now; 125 125 126 126 *cbh = NULL; 127 127 ··· 134 134 return 1; 135 135 136 136 tmp = (struct commit_header *)bh->b_data; 137 + ktime_get_coarse_real_ts64(&now); 137 138 tmp->h_commit_sec = cpu_to_be64(now.tv_sec); 138 139 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); 139 140

Configure Feed

Configure Feed