Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge patch series "iomap: fix zero padding data issue in concurrent append writes"

Long Li <leo.lilong@huawei.com> says:

This patch series fixes zero-padding data issues in concurrent append
write scenarios. A detailed description of the problem and its solution
can be found in patch 2. Patch 1 is preparation for the fix in patch 2:
it eliminates the need to resample the inode size for io_size trimming
and avoids issues caused by inode size changes during concurrent
writeback and truncate operations.

* patches from https://lore.kernel.org/r/20241209114241.3725722-1-leo.lilong@huawei.com:
  iomap: fix zero padding data issue in concurrent append writes
  iomap: pass byte granular end position to iomap_add_to_ioend

Link: https://lore.kernel.org/r/20241209114241.3725722-1-leo.lilong@huawei.com
Signed-off-by: Christian Brauner <brauner@kernel.org>

+58 -10
+57 -9
fs/iomap/buffered-io.c
··· 1774 1774 */ 1775 1775 static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, 1776 1776 struct writeback_control *wbc, struct folio *folio, 1777 - struct inode *inode, loff_t pos, unsigned len) 1777 + struct inode *inode, loff_t pos, loff_t end_pos, 1778 + unsigned len) 1778 1779 { 1779 1780 struct iomap_folio_state *ifs = folio->private; 1780 1781 size_t poff = offset_in_folio(folio, pos); ··· 1794 1793 1795 1794 if (ifs) 1796 1795 atomic_add(len, &ifs->write_bytes_pending); 1796 + 1797 + /* 1798 + * Clamp io_offset and io_size to the incore EOF so that ondisk 1799 + * file size updates in the ioend completion are byte-accurate. 1800 + * This avoids recovering files with zeroed tail regions when 1801 + * writeback races with appending writes: 1802 + * 1803 + * Thread 1: Thread 2: 1804 + * ------------ ----------- 1805 + * write [A, A+B] 1806 + * update inode size to A+B 1807 + * submit I/O [A, A+BS] 1808 + * write [A+B, A+B+C] 1809 + * update inode size to A+B+C 1810 + * <I/O completes, updates disk size to min(A+B+C, A+BS)> 1811 + * <power failure> 1812 + * 1813 + * After reboot: 1814 + * 1) with A+B+C < A+BS, the file has zero padding in range 1815 + * [A+B, A+B+C] 1816 + * 1817 + * |< Block Size (BS) >| 1818 + * |DDDDDDDDDDDD0000000000000| 1819 + * ^ ^ ^ 1820 + * A A+B A+B+C 1821 + * (EOF) 1822 + * 1823 + * 2) with A+B+C > A+BS, the file has zero padding in range 1824 + * [A+B, A+BS] 1825 + * 1826 + * |< Block Size (BS) >|< Block Size (BS) >| 1827 + * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| 1828 + * ^ ^ ^ ^ 1829 + * A A+B A+BS A+B+C 1830 + * (EOF) 1831 + * 1832 + * D = Valid Data 1833 + * 0 = Zero Padding 1834 + * 1835 + * Note that this defeats the ability to chain the ioends of 1836 + * appending writes. 
1837 + */ 1797 1838 wpc->ioend->io_size += len; 1839 + if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) 1840 + wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; 1841 + 1798 1842 wbc_account_cgroup_owner(wbc, folio, len); 1799 1843 return 0; 1800 1844 } 1801 1845 1802 1846 static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, 1803 1847 struct writeback_control *wbc, struct folio *folio, 1804 - struct inode *inode, u64 pos, unsigned dirty_len, 1805 - unsigned *count) 1848 + struct inode *inode, u64 pos, u64 end_pos, 1849 + unsigned dirty_len, unsigned *count) 1806 1850 { 1807 1851 int error; 1808 1852 ··· 1872 1826 break; 1873 1827 default: 1874 1828 error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, 1875 - map_len); 1829 + end_pos, map_len); 1876 1830 if (!error) 1877 1831 (*count)++; 1878 1832 break; ··· 1943 1897 * remaining memory is zeroed when mapped, and writes to that 1944 1898 * region are not written out to the file. 1945 1899 * 1946 - * Also adjust the writeback range to skip all blocks entirely 1947 - * beyond i_size. 1900 + * Also adjust the end_pos to the end of file and skip writeback 1901 + * for all blocks entirely beyond i_size. 1948 1902 */ 1949 1903 folio_zero_segment(folio, poff, folio_size(folio)); 1950 - *end_pos = round_up(isize, i_blocksize(inode)); 1904 + *end_pos = isize; 1951 1905 } 1952 1906 1953 1907 return true; ··· 1960 1914 struct inode *inode = folio->mapping->host; 1961 1915 u64 pos = folio_pos(folio); 1962 1916 u64 end_pos = pos + folio_size(folio); 1917 + u64 end_aligned = 0; 1963 1918 unsigned count = 0; 1964 1919 int error = 0; 1965 1920 u32 rlen; ··· 2002 1955 /* 2003 1956 * Walk through the folio to find dirty areas to write back. 
2004 1957 */ 2005 - while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { 1958 + end_aligned = round_up(end_pos, i_blocksize(inode)); 1959 + while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { 2006 1960 error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, 2007 - pos, rlen, &count); 1961 + pos, end_pos, rlen, &count); 2008 1962 if (error) 2009 1963 break; 2010 1964 pos += rlen;
+1 -1
include/linux/iomap.h
··· 335 335 u16 io_type; 336 336 u16 io_flags; /* IOMAP_F_* */ 337 337 struct inode *io_inode; /* file being written to */ 338 - size_t io_size; /* size of the extent */ 338 + size_t io_size; /* size of data within eof */ 339 339 loff_t io_offset; /* offset in the file */ 340 340 sector_t io_sector; /* start sector of ioend */ 341 341 struct bio io_bio; /* MUST BE LAST! */