Merge tag 'xfs-for-linus-3.17-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs

Pull xfs fixes from Dave Chinner:
"The fixes all address recently discovered data corruption issues.

The original Direct IO issue was discovered by Chris Mason @ Facebook
on a production workload which mixed buffered reads with direct read
and write IO to the same file. The fix for that exposed other issues
with page invalidation (exposed by millions of fsx operations) failing
due to dirty buffers beyond EOF.

Finally, the collapse_range code could also cause problems due to
racing writeback changing the extent map while it was being shifted
around. The commits for that problem are simple mitigation fixes that
prevent the problem from occurring. A more robust fix for 3.18 that
addresses the underlying problem is currently being worked on by
Brian.

Summary of fixes:
- a direct IO read/buffered read data corruption
- the associated fallout from the DIO data corruption fix
- collapse range bugs that are potential data corruption issues"

* tag 'xfs-for-linus-3.17-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs:
xfs: trim eofblocks before collapse range
xfs: xfs_file_collapse_range is delalloc challenged
xfs: don't log inode unless extent shift makes extent modifications
xfs: use ranged writeback and invalidation for direct IO
xfs: don't zero partial page cache pages during O_DIRECT writes
xfs: don't zero partial page cache pages during O_DIRECT writes
xfs: don't dirty buffers beyond EOF

Linus Torvalds 11 years ago 11e97398 925e0ea4

+114 -12

4 changed files

expand all

xfs

libxfs

xfs_bmap.c

xfs_aops.c

xfs_bmap_util.c

xfs_file.c

+10 -8

fs/xfs/libxfs/xfs_bmap.c

··· 5424 5424 struct xfs_bmap_free *flist, 5425 5425 int num_exts) 5426 5426 { 5427 - struct xfs_btree_cur *cur; 5427 + struct xfs_btree_cur *cur = NULL; 5428 5428 struct xfs_bmbt_rec_host *gotp; 5429 5429 struct xfs_bmbt_irec got; 5430 5430 struct xfs_bmbt_irec left; ··· 5435 5435 int error = 0; 5436 5436 int i; 5437 5437 int whichfork = XFS_DATA_FORK; 5438 - int logflags; 5438 + int logflags = 0; 5439 5439 xfs_filblks_t blockcount = 0; 5440 5440 int total_extents; 5441 5441 ··· 5478 5478 } 5479 5479 } 5480 5480 5481 - /* We are going to change core inode */ 5482 - logflags = XFS_ILOG_CORE; 5483 5481 if (ifp->if_flags & XFS_IFBROOT) { 5484 5482 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); 5485 5483 cur->bc_private.b.firstblock = *firstblock; 5486 5484 cur->bc_private.b.flist = flist; 5487 5485 cur->bc_private.b.flags = 0; 5488 - } else { 5489 - cur = NULL; 5490 - logflags |= XFS_ILOG_DEXT; 5491 5486 } 5492 5487 5493 5488 /* ··· 5540 5545 blockcount = left.br_blockcount + 5541 5546 got.br_blockcount; 5542 5547 xfs_iext_remove(ip, *current_ext, 1, 0); 5548 + logflags |= XFS_ILOG_CORE; 5543 5549 if (cur) { 5544 5550 error = xfs_btree_delete(cur, &i); 5545 5551 if (error) 5546 5552 goto del_cursor; 5547 5553 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); 5554 + } else { 5555 + logflags |= XFS_ILOG_DEXT; 5548 5556 } 5549 5557 XFS_IFORK_NEXT_SET(ip, whichfork, 5550 5558 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); ··· 5573 5575 got.br_startoff = startoff; 5574 5576 } 5575 5577 5578 + logflags |= XFS_ILOG_CORE; 5576 5579 if (cur) { 5577 5580 error = xfs_bmbt_update(cur, got.br_startoff, 5578 5581 got.br_startblock, ··· 5581 5582 got.br_state); 5582 5583 if (error) 5583 5584 goto del_cursor; 5585 + } else { 5586 + logflags |= XFS_ILOG_DEXT; 5584 5587 } 5585 5588 5586 5589 (*current_ext)++; ··· 5598 5597 xfs_btree_del_cursor(cur, 5599 5598 error ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); 5600 5599 5601 - xfs_trans_log_inode(tp, ip, logflags); 5600 + if (logflags) 5601 + xfs_trans_log_inode(tp, ip, logflags); 5602 5602 return error; 5603 5603 }

+61

fs/xfs/xfs_aops.c

··· 1753 1753 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); 1754 1754 } 1755 1755 1756 + /* 1757 + * This is basically a copy of __set_page_dirty_buffers() with one 1758 + * small tweak: buffers beyond EOF do not get marked dirty. If we mark them 1759 + * dirty, we'll never be able to clean them because we don't write buffers 1760 + * beyond EOF, and that means we can't invalidate pages that span EOF 1761 + * that have been marked dirty. Further, the dirty state can leak into 1762 + * the file interior if the file is extended, resulting in all sorts of 1763 + * bad things happening as the state does not match the underlying data. 1764 + * 1765 + * XXX: this really indicates that bufferheads in XFS need to die. Warts like 1766 + * this only exist because of bufferheads and how the generic code manages them. 1767 + */ 1768 + STATIC int 1769 + xfs_vm_set_page_dirty( 1770 + struct page *page) 1771 + { 1772 + struct address_space *mapping = page->mapping; 1773 + struct inode *inode = mapping->host; 1774 + loff_t end_offset; 1775 + loff_t offset; 1776 + int newly_dirty; 1777 + 1778 + if (unlikely(!mapping)) 1779 + return !TestSetPageDirty(page); 1780 + 1781 + end_offset = i_size_read(inode); 1782 + offset = page_offset(page); 1783 + 1784 + spin_lock(&mapping->private_lock); 1785 + if (page_has_buffers(page)) { 1786 + struct buffer_head *head = page_buffers(page); 1787 + struct buffer_head *bh = head; 1788 + 1789 + do { 1790 + if (offset < end_offset) 1791 + set_buffer_dirty(bh); 1792 + bh = bh->b_this_page; 1793 + offset += 1 << inode->i_blkbits; 1794 + } while (bh != head); 1795 + } 1796 + newly_dirty = !TestSetPageDirty(page); 1797 + spin_unlock(&mapping->private_lock); 1798 + 1799 + if (newly_dirty) { 1800 + /* sigh - __set_page_dirty() is static, so copy it here, too */ 1801 + unsigned long flags; 1802 + 1803 + spin_lock_irqsave(&mapping->tree_lock, flags); 1804 + if (page->mapping) { /* Race with truncate? 
*/ 1805 + WARN_ON_ONCE(!PageUptodate(page)); 1806 + account_page_dirtied(page, mapping); 1807 + radix_tree_tag_set(&mapping->page_tree, 1808 + page_index(page), PAGECACHE_TAG_DIRTY); 1809 + } 1810 + spin_unlock_irqrestore(&mapping->tree_lock, flags); 1811 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1812 + } 1813 + return newly_dirty; 1814 + } 1815 + 1756 1816 const struct address_space_operations xfs_address_space_operations = { 1757 1817 .readpage = xfs_vm_readpage, 1758 1818 .readpages = xfs_vm_readpages, 1759 1819 .writepage = xfs_vm_writepage, 1760 1820 .writepages = xfs_vm_writepages, 1821 + .set_page_dirty = xfs_vm_set_page_dirty, 1761 1822 .releasepage = xfs_vm_releasepage, 1762 1823 .invalidatepage = xfs_vm_invalidatepage, 1763 1824 .write_begin = xfs_vm_write_begin,

+20

fs/xfs/xfs_bmap_util.c

··· 1470 1470 start_fsb = XFS_B_TO_FSB(mp, offset + len); 1471 1471 shift_fsb = XFS_B_TO_FSB(mp, len); 1472 1472 1473 + /* 1474 + * Writeback the entire file and force remove any post-eof blocks. The 1475 + * writeback prevents changes to the extent list via concurrent 1476 + * writeback and the eofblocks trim prevents the extent shift algorithm 1477 + * from running into a post-eof delalloc extent. 1478 + * 1479 + * XXX: This is a temporary fix until the extent shift loop below is 1480 + * converted to use offsets and lookups within the ILOCK rather than 1481 + * carrying around the index into the extent list for the next 1482 + * iteration. 1483 + */ 1484 + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); 1485 + if (error) 1486 + return error; 1487 + if (xfs_can_free_eofblocks(ip, true)) { 1488 + error = xfs_free_eofblocks(mp, ip, false); 1489 + if (error) 1490 + return error; 1491 + } 1492 + 1473 1493 error = xfs_free_file_space(ip, offset, len); 1474 1494 if (error) 1475 1495 return error;

+23 -4

fs/xfs/xfs_file.c

··· 291 291 if (inode->i_mapping->nrpages) { 292 292 ret = filemap_write_and_wait_range( 293 293 VFS_I(ip)->i_mapping, 294 - pos, -1); 294 + pos, pos + size - 1); 295 295 if (ret) { 296 296 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); 297 297 return ret; 298 298 } 299 - truncate_pagecache_range(VFS_I(ip), pos, -1); 299 + 300 + /* 301 + * Invalidate whole pages. This can return an error if 302 + * we fail to invalidate a page, but this should never 303 + * happen on XFS. Warn if it does fail. 304 + */ 305 + ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, 306 + pos >> PAGE_CACHE_SHIFT, 307 + (pos + size - 1) >> PAGE_CACHE_SHIFT); 308 + WARN_ON_ONCE(ret); 309 + ret = 0; 300 310 } 301 311 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 302 312 } ··· 642 632 643 633 if (mapping->nrpages) { 644 634 ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 645 - pos, -1); 635 + pos, pos + count - 1); 646 636 if (ret) 647 637 goto out; 648 - truncate_pagecache_range(VFS_I(ip), pos, -1); 638 + /* 639 + * Invalidate whole pages. This can return an error if 640 + * we fail to invalidate a page, but this should never 641 + * happen on XFS. Warn if it does fail. 642 + */ 643 + ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, 644 + pos >> PAGE_CACHE_SHIFT, 645 + (pos + count - 1) >> PAGE_CACHE_SHIFT); 646 + WARN_ON_ONCE(ret); 647 + ret = 0; 649 648 } 650 649 651 650 /*

Configure Feed

Configure Feed