Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'xfs-4.20-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Darrick Wong:
"Dave and I have continued our work fixing corruption problems that can
be found when running long-term burn-in exercisers on xfs. Here are
some patches fixing most of the problems, but there will likely be
more. :/

- Numerous corruption fixes for copy on write

- Numerous corruption fixes for blocksize < pagesize writes

- Don't miscalculate AG reservations for small final AGs

- Fix page cache truncation to work properly for reflink and extent shifting

- Fix use-after-free when retrying failed inode/dquot buffer logging

- Fix corruptions seen when using copy_file_range in directio mode"

* tag 'xfs-4.20-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  iomap: readpages doesn't zero page tail beyond EOF
  vfs: vfs_dedupe_file_range() doesn't return EOPNOTSUPP
  iomap: dio data corruption and spurious errors when pipes fill
  iomap: sub-block dio needs to zeroout beyond EOF
  iomap: FUA is wrong for DIO O_DSYNC writes into unwritten extents
  xfs: delalloc -> unwritten COW fork allocation can go wrong
  xfs: flush removing page cache in xfs_reflink_remap_prep
  xfs: extent shifting doesn't fully invalidate page cache
  xfs: finobt AG reserves don't consider last AG can be a runt
  xfs: fix transient reference count error in xfs_buf_resubmit_failed_buffers
  xfs: uncached buffer tracing needs to print bno
  xfs: make xfs_file_remap_range() static
  xfs: fix shared extent data corruption due to missing cow reservation

+104 -46
+41 -12
fs/iomap.c
··· 142 142 iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, 143 143 loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) 144 144 { 145 + loff_t orig_pos = *pos; 146 + loff_t isize = i_size_read(inode); 145 147 unsigned block_bits = inode->i_blkbits; 146 148 unsigned block_size = (1 << block_bits); 147 149 unsigned poff = offset_in_page(*pos); 148 150 unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); 149 151 unsigned first = poff >> block_bits; 150 152 unsigned last = (poff + plen - 1) >> block_bits; 151 - unsigned end = offset_in_page(i_size_read(inode)) >> block_bits; 152 153 153 154 /* 154 155 * If the block size is smaller than the page size we need to check the ··· 184 183 * handle both halves separately so that we properly zero data in the 185 184 * page cache for blocks that are entirely outside of i_size. 186 185 */ 187 - if (first <= end && last > end) 188 - plen -= (last - end) * block_size; 186 + if (orig_pos <= isize && orig_pos + length > isize) { 187 + unsigned end = offset_in_page(isize - 1) >> block_bits; 188 + 189 + if (first <= end && last > end) 190 + plen -= (last - end) * block_size; 191 + } 189 192 190 193 *offp = poff; 191 194 *lenp = plen; ··· 1585 1580 struct bio *bio; 1586 1581 bool need_zeroout = false; 1587 1582 bool use_fua = false; 1588 - int nr_pages, ret; 1583 + int nr_pages, ret = 0; 1589 1584 size_t copied = 0; 1590 1585 1591 1586 if ((pos | length | align) & ((1 << blkbits) - 1)) ··· 1601 1596 1602 1597 if (iomap->flags & IOMAP_F_NEW) { 1603 1598 need_zeroout = true; 1604 - } else { 1599 + } else if (iomap->type == IOMAP_MAPPED) { 1605 1600 /* 1606 - * Use a FUA write if we need datasync semantics, this 1607 - * is a pure data IO that doesn't require any metadata 1608 - * updates and the underlying device supports FUA. This 1609 - * allows us to avoid cache flushes on IO completion. 
1601 + * Use a FUA write if we need datasync semantics, this is a pure 1602 + * data IO that doesn't require any metadata updates (including 1603 + * after IO completion such as unwritten extent conversion) and 1604 + * the underlying device supports FUA. This allows us to avoid 1605 + * cache flushes on IO completion. 1610 1606 */ 1611 1607 if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && 1612 1608 (dio->flags & IOMAP_DIO_WRITE_FUA) && ··· 1650 1644 1651 1645 ret = bio_iov_iter_get_pages(bio, &iter); 1652 1646 if (unlikely(ret)) { 1647 + /* 1648 + * We have to stop part way through an IO. We must fall 1649 + * through to the sub-block tail zeroing here, otherwise 1650 + * this short IO may expose stale data in the tail of 1651 + * the block we haven't written data to. 1652 + */ 1653 1653 bio_put(bio); 1654 - return copied ? copied : ret; 1654 + goto zero_tail; 1655 1655 } 1656 1656 1657 1657 n = bio->bi_iter.bi_size; ··· 1688 1676 dio->submit.cookie = submit_bio(bio); 1689 1677 } while (nr_pages); 1690 1678 1691 - if (need_zeroout) { 1679 + /* 1680 + * We need to zeroout the tail of a sub-block write if the extent type 1681 + * requires zeroing or the write extends beyond EOF. If we don't zero 1682 + * the block tail in the latter case, we can expose stale data via mmap 1683 + * reads of the EOF block. 1684 + */ 1685 + zero_tail: 1686 + if (need_zeroout || 1687 + ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { 1692 1688 /* zero out from the end of the write to the end of the block */ 1693 1689 pad = pos & (fs_block_size - 1); 1694 1690 if (pad) 1695 1691 iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); 1696 1692 } 1697 - return copied; 1693 + return copied ? copied : ret; 1698 1694 } 1699 1695 1700 1696 static loff_t ··· 1877 1857 dio->wait_for_completion = true; 1878 1858 ret = 0; 1879 1859 } 1860 + 1861 + /* 1862 + * Splicing to pipes can fail on a full pipe. 
We have to 1863 + * swallow this to make it look like a short IO 1864 + * otherwise the higher splice layers will completely 1865 + * mishandle the error and stop moving data. 1866 + */ 1867 + if (ret == -EFAULT) 1868 + ret = 0; 1880 1869 break; 1881 1870 } 1882 1871 pos += ret;
+7 -8
fs/read_write.c
··· 2094 2094 off = same->src_offset; 2095 2095 len = same->src_length; 2096 2096 2097 - ret = -EISDIR; 2098 2097 if (S_ISDIR(src->i_mode)) 2099 - goto out; 2098 + return -EISDIR; 2100 2099 2101 - ret = -EINVAL; 2102 2100 if (!S_ISREG(src->i_mode)) 2103 - goto out; 2101 + return -EINVAL; 2102 + 2103 + if (!file->f_op->remap_file_range) 2104 + return -EOPNOTSUPP; 2104 2105 2105 2106 ret = remap_verify_area(file, off, len, false); 2106 2107 if (ret < 0) 2107 - goto out; 2108 + return ret; 2108 2109 ret = 0; 2109 2110 2110 2111 if (off + len > i_size_read(src)) ··· 2148 2147 fdput(dst_fd); 2149 2148 next_loop: 2150 2149 if (fatal_signal_pending(current)) 2151 - goto out; 2150 + break; 2152 2151 } 2153 - 2154 - out: 2155 2152 return ret; 2156 2153 } 2157 2154 EXPORT_SYMBOL(vfs_dedupe_file_range);
+4 -1
fs/xfs/libxfs/xfs_bmap.c
··· 1694 1694 case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: 1695 1695 /* 1696 1696 * Filling in all of a previously delayed allocation extent. 1697 - * The right neighbor is contiguous, the left is not. 1697 + * The right neighbor is contiguous, the left is not. Take care 1698 + * with delay -> unwritten extent allocation here because the 1699 + * delalloc record we are overwriting is always written. 1698 1700 */ 1699 1701 PREV.br_startblock = new->br_startblock; 1700 1702 PREV.br_blockcount += RIGHT.br_blockcount; 1703 + PREV.br_state = new->br_state; 1701 1704 1702 1705 xfs_iext_next(ifp, &bma->icur); 1703 1706 xfs_iext_remove(bma->ip, &bma->icur, state);
+7 -4
fs/xfs/libxfs/xfs_ialloc_btree.c
··· 538 538 539 539 static xfs_extlen_t 540 540 xfs_inobt_max_size( 541 - struct xfs_mount *mp) 541 + struct xfs_mount *mp, 542 + xfs_agnumber_t agno) 542 543 { 544 + xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno); 545 + 543 546 /* Bail out if we're uninitialized, which can happen in mkfs. */ 544 547 if (mp->m_inobt_mxr[0] == 0) 545 548 return 0; 546 549 547 550 return xfs_btree_calc_size(mp->m_inobt_mnr, 548 - (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / 549 - XFS_INODES_PER_CHUNK); 551 + (uint64_t)agblocks * mp->m_sb.sb_inopblock / 552 + XFS_INODES_PER_CHUNK); 550 553 } 551 554 552 555 static int ··· 597 594 if (error) 598 595 return error; 599 596 600 - *ask += xfs_inobt_max_size(mp); 597 + *ask += xfs_inobt_max_size(mp, agno); 601 598 *used += tree_len; 602 599 return 0; 603 600 }
+2 -8
fs/xfs/xfs_bmap_util.c
··· 1042 1042 goto out_unlock; 1043 1043 } 1044 1044 1045 - static int 1045 + int 1046 1046 xfs_flush_unmap_range( 1047 1047 struct xfs_inode *ip, 1048 1048 xfs_off_t offset, ··· 1195 1195 * Writeback and invalidate cache for the remainder of the file as we're 1196 1196 * about to shift down every extent from offset to EOF. 1197 1197 */ 1198 - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1); 1199 - if (error) 1200 - return error; 1201 - error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, 1202 - offset >> PAGE_SHIFT, -1); 1203 - if (error) 1204 - return error; 1198 + error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip)); 1205 1199 1206 1200 /* 1207 1201 * Clean out anything hanging around in the cow fork now that
+3
fs/xfs/xfs_bmap_util.h
··· 80 80 int whichfork, xfs_extnum_t *nextents, 81 81 xfs_filblks_t *count); 82 82 83 + int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset, 84 + xfs_off_t len); 85 + 83 86 #endif /* __XFS_BMAP_UTIL_H__ */
+21 -7
fs/xfs/xfs_buf_item.c
··· 1233 1233 } 1234 1234 1235 1235 /* 1236 - * Requeue a failed buffer for writeback 1236 + * Requeue a failed buffer for writeback. 1237 1237 * 1238 - * Return true if the buffer has been re-queued properly, false otherwise 1238 + * We clear the log item failed state here as well, but we have to be careful 1239 + * about reference counts because the only active reference counts on the buffer 1240 + * may be the failed log items. Hence if we clear the log item failed state 1241 + * before queuing the buffer for IO we can release all active references to 1242 + * the buffer and free it, leading to use after free problems in 1243 + * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which 1244 + * order we process them in - the buffer is locked, and we own the buffer list 1245 + * so nothing on them is going to change while we are performing this action. 1246 + * 1247 + * Hence we can safely queue the buffer for IO before we clear the failed log 1248 + * item state, therefore always having an active reference to the buffer and 1249 + * avoiding the transient zero-reference state that leads to use-after-free. 1250 + * 1251 + * Return true if the buffer was added to the buffer list, false if it was 1252 + * already on the buffer list. 
1239 1253 */ 1240 1254 bool 1241 1255 xfs_buf_resubmit_failed_buffers( ··· 1257 1243 struct list_head *buffer_list) 1258 1244 { 1259 1245 struct xfs_log_item *lip; 1246 + bool ret; 1247 + 1248 + ret = xfs_buf_delwri_queue(bp, buffer_list); 1260 1249 1261 1250 /* 1262 - * Clear XFS_LI_FAILED flag from all items before resubmit 1263 - * 1264 - * XFS_LI_FAILED set/clear is protected by ail_lock, caller this 1251 + * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this 1265 1252 * function already have it acquired 1266 1253 */ 1267 1254 list_for_each_entry(lip, &bp->b_li_list, li_bio_list) 1268 1255 xfs_clear_li_failed(lip); 1269 1256 1270 - /* Add this buffer back to the delayed write list */ 1271 - return xfs_buf_delwri_queue(bp, buffer_list); 1257 + return ret; 1272 1258 }
+1 -1
fs/xfs/xfs_file.c
··· 920 920 } 921 921 922 922 923 - loff_t 923 + STATIC loff_t 924 924 xfs_file_remap_range( 925 925 struct file *file_in, 926 926 loff_t pos_in,
+14 -4
fs/xfs/xfs_reflink.c
··· 296 296 if (error) 297 297 return error; 298 298 299 + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); 299 300 trace_xfs_reflink_cow_alloc(ip, &got); 300 301 return 0; 301 302 } ··· 1352 1351 if (ret) 1353 1352 goto out_unlock; 1354 1353 1355 - /* Zap any page cache for the destination file's range. */ 1356 - truncate_inode_pages_range(&inode_out->i_data, 1357 - round_down(pos_out, PAGE_SIZE), 1358 - round_up(pos_out + *len, PAGE_SIZE) - 1); 1354 + /* 1355 + * If pos_out > EOF, we may have dirtied blocks between EOF and 1356 + * pos_out. In that case, we need to extend the flush and unmap to cover 1357 + * from EOF to the end of the copy length. 1358 + */ 1359 + if (pos_out > XFS_ISIZE(dest)) { 1360 + loff_t flen = *len + (pos_out - XFS_ISIZE(dest)); 1361 + ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen); 1362 + } else { 1363 + ret = xfs_flush_unmap_range(dest, pos_out, *len); 1364 + } 1365 + if (ret) 1366 + goto out_unlock; 1359 1367 1360 1368 return 1; 1361 1369 out_unlock:
+4 -1
fs/xfs/xfs_trace.h
··· 280 280 ), 281 281 TP_fast_assign( 282 282 __entry->dev = bp->b_target->bt_dev; 283 - __entry->bno = bp->b_bn; 283 + if (bp->b_bn == XFS_BUF_DADDR_NULL) 284 + __entry->bno = bp->b_maps[0].bm_bn; 285 + else 286 + __entry->bno = bp->b_bn; 284 287 __entry->nblks = bp->b_length; 285 288 __entry->hold = atomic_read(&bp->b_hold); 286 289 __entry->pincount = atomic_read(&bp->b_pin_count);