Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'xfs-5.14-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Darrick Wong:
"This contains a bunch of bug fixes in XFS.

Dave and I have spent the last couple of weeks finding and fixing as
many log recovery bugs as we can; here are the results so far. Go
fstests -g recoveryloop! ;)

- Fix a number of coordination bugs relating to cache flushes for
metadata writeback, cache flushes for multi-buffer log writes, and
FUA writes for single-buffer log writes

- Fix a bug with incorrect replay of attr3 blocks

- Fix unnecessary stalls when flushing logs to disk

- Fix spoofing problems when recovering realtime bitmap blocks"

* tag 'xfs-5.14-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
xfs: prevent spoofing of rtbitmap blocks when recovering buffers
xfs: limit iclog tail updates
xfs: need to see iclog flags in tracing
xfs: Enforce attr3 buffer recovery order
xfs: logging the on disk inode LSN can make it go backwards
xfs: avoid unnecessary waits in xfs_log_force_lsn()
xfs: log forces imply data device cache flushes
xfs: factor out forced iclog flushes
xfs: fix ordering violation between cache flushes and tail updates
xfs: fold __xlog_state_release_iclog into xlog_state_release_iclog
xfs: external logs need to flush data device
xfs: flush data dev on external log write

+244 -106
+10 -1
fs/xfs/libxfs/xfs_log_format.h
··· 411 411 /* start of the extended dinode, writable fields */ 412 412 uint32_t di_crc; /* CRC of the inode */ 413 413 uint64_t di_changecount; /* number of attribute changes */ 414 - xfs_lsn_t di_lsn; /* flush sequence */ 414 + 415 + /* 416 + * The LSN we write to this field during formatting is not a reflection 417 + * of the current on-disk LSN. It should never be used for recovery 418 + * sequencing, nor should it be recovered into the on-disk inode at all. 419 + * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk() 420 + * for details. 421 + */ 422 + xfs_lsn_t di_lsn; 423 + 415 424 uint64_t di_flags2; /* more random flags */ 416 425 uint32_t di_cowextsize; /* basic cow extent size for file */ 417 426 uint8_t di_pad2[12]; /* more padding for future expansion */
+13 -2
fs/xfs/xfs_buf_item_recover.c
··· 698 698 static xfs_lsn_t 699 699 xlog_recover_get_buf_lsn( 700 700 struct xfs_mount *mp, 701 - struct xfs_buf *bp) 701 + struct xfs_buf *bp, 702 + struct xfs_buf_log_format *buf_f) 702 703 { 703 704 uint32_t magic32; 704 705 uint16_t magic16; ··· 707 706 void *blk = bp->b_addr; 708 707 uuid_t *uuid; 709 708 xfs_lsn_t lsn = -1; 709 + uint16_t blft; 710 710 711 711 /* v4 filesystems always recover immediately */ 712 712 if (!xfs_sb_version_hascrc(&mp->m_sb)) 713 + goto recover_immediately; 714 + 715 + /* 716 + * realtime bitmap and summary file blocks do not have magic numbers or 717 + * UUIDs, so we must recover them immediately. 718 + */ 719 + blft = xfs_blft_from_flags(buf_f); 720 + if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF) 713 721 goto recover_immediately; 714 722 715 723 magic32 = be32_to_cpu(*(__be32 *)blk); ··· 806 796 switch (magicda) { 807 797 case XFS_DIR3_LEAF1_MAGIC: 808 798 case XFS_DIR3_LEAFN_MAGIC: 799 + case XFS_ATTR3_LEAF_MAGIC: 809 800 case XFS_DA3_NODE_MAGIC: 810 801 lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); 811 802 uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; ··· 930 919 * the verifier will be reset to match whatever recover turns that 931 920 * buffer into. 932 921 */ 933 - lsn = xlog_recover_get_buf_lsn(mp, bp); 922 + lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f); 934 923 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 935 924 trace_xfs_log_recover_buf_skip(log, buf_f); 936 925 xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
+29 -10
fs/xfs/xfs_inode_item_recover.c
··· 145 145 STATIC void 146 146 xfs_log_dinode_to_disk( 147 147 struct xfs_log_dinode *from, 148 - struct xfs_dinode *to) 148 + struct xfs_dinode *to, 149 + xfs_lsn_t lsn) 149 150 { 150 151 to->di_magic = cpu_to_be16(from->di_magic); 151 152 to->di_mode = cpu_to_be16(from->di_mode); ··· 183 182 to->di_flags2 = cpu_to_be64(from->di_flags2); 184 183 to->di_cowextsize = cpu_to_be32(from->di_cowextsize); 185 184 to->di_ino = cpu_to_be64(from->di_ino); 186 - to->di_lsn = cpu_to_be64(from->di_lsn); 185 + to->di_lsn = cpu_to_be64(lsn); 187 186 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); 188 187 uuid_copy(&to->di_uuid, &from->di_uuid); 189 188 to->di_flushiter = 0; ··· 262 261 } 263 262 264 263 /* 265 - * If the inode has an LSN in it, recover the inode only if it's less 266 - * than the lsn of the transaction we are replaying. Note: we still 267 - * need to replay an owner change even though the inode is more recent 268 - * than the transaction as there is no guarantee that all the btree 269 - * blocks are more recent than this transaction, too. 264 + * If the inode has an LSN in it, recover the inode only if the on-disk 265 + * inode's LSN is older than the lsn of the transaction we are 266 + * replaying. We can have multiple checkpoints with the same start LSN, 267 + * so the current LSN being equal to the on-disk LSN doesn't necessarily 268 + * mean that the on-disk inode is more recent than the change being 269 + * replayed. 270 + * 271 + * We must check the current_lsn against the on-disk inode 272 + * here because the we can't trust the log dinode to contain a valid LSN 273 + * (see comment below before replaying the log dinode for details). 274 + * 275 + * Note: we still need to replay an owner change even though the inode 276 + * is more recent than the transaction as there is no guarantee that all 277 + * the btree blocks are more recent than this transaction, too. 
270 278 */ 271 279 if (dip->di_version >= 3) { 272 280 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); 273 281 274 - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 282 + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) { 275 283 trace_xfs_log_recover_inode_skip(log, in_f); 276 284 error = 0; 277 285 goto out_owner_change; ··· 378 368 goto out_release; 379 369 } 380 370 381 - /* recover the log dinode inode into the on disk inode */ 382 - xfs_log_dinode_to_disk(ldip, dip); 371 + /* 372 + * Recover the log dinode inode into the on disk inode. 373 + * 374 + * The LSN in the log dinode is garbage - it can be zero or reflect 375 + * stale in-memory runtime state that isn't coherent with the changes 376 + * logged in this transaction or the changes written to the on-disk 377 + * inode. Hence we write the current lSN into the inode because that 378 + * matches what xfs_iflush() would write inode the inode when flushing 379 + * the changes in this transaction. 380 + */ 381 + xfs_log_dinode_to_disk(ldip, dip, current_lsn); 383 382 384 383 fields = in_f->ilf_fields; 385 384 if (fields & XFS_ILOG_DEV)
+165 -86
fs/xfs/xfs_log.c
··· 78 78 STATIC void 79 79 xlog_verify_tail_lsn( 80 80 struct xlog *log, 81 - struct xlog_in_core *iclog, 82 - xfs_lsn_t tail_lsn); 81 + struct xlog_in_core *iclog); 83 82 #else 84 83 #define xlog_verify_dest_ptr(a,b) 85 84 #define xlog_verify_grant_tail(a) 86 85 #define xlog_verify_iclog(a,b,c) 87 - #define xlog_verify_tail_lsn(a,b,c) 86 + #define xlog_verify_tail_lsn(a,b) 88 87 #endif 89 88 90 89 STATIC int ··· 486 487 return error; 487 488 } 488 489 489 - static bool 490 - __xlog_state_release_iclog( 491 - struct xlog *log, 492 - struct xlog_in_core *iclog) 493 - { 494 - lockdep_assert_held(&log->l_icloglock); 495 - 496 - if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 497 - /* update tail before writing to iclog */ 498 - xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); 499 - 500 - iclog->ic_state = XLOG_STATE_SYNCING; 501 - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); 502 - xlog_verify_tail_lsn(log, iclog, tail_lsn); 503 - /* cycle incremented when incrementing curr_block */ 504 - trace_xlog_iclog_syncing(iclog, _RET_IP_); 505 - return true; 506 - } 507 - 508 - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); 509 - return false; 510 - } 511 - 512 490 /* 513 491 * Flush iclog to disk if this is the last reference to the given iclog and the 514 492 * it is in the WANT_SYNC state. 493 + * 494 + * If the caller passes in a non-zero @old_tail_lsn and the current log tail 495 + * does not match, there may be metadata on disk that must be persisted before 496 + * this iclog is written. To satisfy that requirement, set the 497 + * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new 498 + * log tail value. 499 + * 500 + * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the 501 + * log tail is updated correctly. NEED_FUA indicates that the iclog will be 502 + * written to stable storage, and implies that a commit record is contained 503 + * within the iclog. 
We need to ensure that the log tail does not move beyond 504 + * the tail that the first commit record in the iclog ordered against, otherwise 505 + * correct recovery of that checkpoint becomes dependent on future operations 506 + * performed on this iclog. 507 + * 508 + * Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the 509 + * current tail into iclog. Once the iclog tail is set, future operations must 510 + * not modify it, otherwise they potentially violate ordering constraints for 511 + * the checkpoint commit that wrote the initial tail lsn value. The tail lsn in 512 + * the iclog will get zeroed on activation of the iclog after sync, so we 513 + * always capture the tail lsn on the iclog on the first NEED_FUA release 514 + * regardless of the number of active reference counts on this iclog. 515 515 */ 516 + 516 517 int 517 518 xlog_state_release_iclog( 518 519 struct xlog *log, 519 - struct xlog_in_core *iclog) 520 + struct xlog_in_core *iclog, 521 + xfs_lsn_t old_tail_lsn) 520 522 { 523 + xfs_lsn_t tail_lsn; 521 524 lockdep_assert_held(&log->l_icloglock); 522 525 523 526 trace_xlog_iclog_release(iclog, _RET_IP_); 524 527 if (iclog->ic_state == XLOG_STATE_IOERROR) 525 528 return -EIO; 526 529 527 - if (atomic_dec_and_test(&iclog->ic_refcnt) && 528 - __xlog_state_release_iclog(log, iclog)) { 529 - spin_unlock(&log->l_icloglock); 530 - xlog_sync(log, iclog); 531 - spin_lock(&log->l_icloglock); 530 + /* 531 + * Grabbing the current log tail needs to be atomic w.r.t. the writing 532 + * of the tail LSN into the iclog so we guarantee that the log tail does 533 + * not move between deciding if a cache flush is required and writing 534 + * the LSN into the iclog below. 
535 + */ 536 + if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) { 537 + tail_lsn = xlog_assign_tail_lsn(log->l_mp); 538 + 539 + if (old_tail_lsn && tail_lsn != old_tail_lsn) 540 + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; 541 + 542 + if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) && 543 + !iclog->ic_header.h_tail_lsn) 544 + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); 532 545 } 533 546 547 + if (!atomic_dec_and_test(&iclog->ic_refcnt)) 548 + return 0; 549 + 550 + if (iclog->ic_state != XLOG_STATE_WANT_SYNC) { 551 + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); 552 + return 0; 553 + } 554 + 555 + iclog->ic_state = XLOG_STATE_SYNCING; 556 + if (!iclog->ic_header.h_tail_lsn) 557 + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); 558 + xlog_verify_tail_lsn(log, iclog); 559 + trace_xlog_iclog_syncing(iclog, _RET_IP_); 560 + 561 + spin_unlock(&log->l_icloglock); 562 + xlog_sync(log, iclog); 563 + spin_lock(&log->l_icloglock); 534 564 return 0; 535 565 } 536 566 ··· 802 774 } 803 775 804 776 /* 777 + * Flush out the iclog to disk ensuring that device caches are flushed and 778 + * the iclog hits stable storage before any completion waiters are woken. 779 + */ 780 + static inline int 781 + xlog_force_iclog( 782 + struct xlog_in_core *iclog) 783 + { 784 + atomic_inc(&iclog->ic_refcnt); 785 + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; 786 + if (iclog->ic_state == XLOG_STATE_ACTIVE) 787 + xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); 788 + return xlog_state_release_iclog(iclog->ic_log, iclog, 0); 789 + } 790 + 791 + /* 805 792 * Wait for the iclog and all prior iclogs to be written disk as required by the 806 793 * log force state machine. 
Waiting on ic_force_wait ensures iclog completions 807 794 * have been ordered and callbacks run before we are woken here, hence ··· 870 827 /* account for space used by record data */ 871 828 ticket->t_curr_res -= sizeof(ulf); 872 829 873 - /* 874 - * For external log devices, we need to flush the data device cache 875 - * first to ensure all metadata writeback is on stable storage before we 876 - * stamp the tail LSN into the unmount record. 877 - */ 878 - if (log->l_targ != log->l_mp->m_ddev_targp) 879 - blkdev_issue_flush(log->l_targ->bt_bdev); 880 830 return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); 881 831 } 882 832 ··· 901 865 902 866 spin_lock(&log->l_icloglock); 903 867 iclog = log->l_iclog; 904 - atomic_inc(&iclog->ic_refcnt); 905 - if (iclog->ic_state == XLOG_STATE_ACTIVE) 906 - xlog_state_switch_iclogs(log, iclog, 0); 907 - else 908 - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || 909 - iclog->ic_state == XLOG_STATE_IOERROR); 910 - /* 911 - * Ensure the journal is fully flushed and on stable storage once the 912 - * iclog containing the unmount record is written. 913 - */ 914 - iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); 915 - error = xlog_state_release_iclog(log, iclog); 868 + error = xlog_force_iclog(iclog); 916 869 xlog_wait_on_iclog(iclog); 917 870 918 871 if (tic) { ··· 1821 1796 * metadata writeback and causing priority inversions. 1822 1797 */ 1823 1798 iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE; 1824 - if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) 1799 + if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) { 1825 1800 iclog->ic_bio.bi_opf |= REQ_PREFLUSH; 1801 + /* 1802 + * For external log devices, we also need to flush the data 1803 + * device cache first to ensure all metadata writeback covered 1804 + * by the LSN in this iclog is on stable storage. This is slow, 1805 + * but it *must* complete before we issue the external log IO. 
1806 + */ 1807 + if (log->l_targ != log->l_mp->m_ddev_targp) 1808 + blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev); 1809 + } 1826 1810 if (iclog->ic_flags & XLOG_ICL_NEED_FUA) 1827 1811 iclog->ic_bio.bi_opf |= REQ_FUA; 1812 + 1828 1813 iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); 1829 1814 1830 1815 if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { ··· 2345 2310 return 0; 2346 2311 2347 2312 release_iclog: 2348 - error = xlog_state_release_iclog(log, iclog); 2313 + error = xlog_state_release_iclog(log, iclog, 0); 2349 2314 spin_unlock(&log->l_icloglock); 2350 2315 return error; 2351 2316 } ··· 2564 2529 ASSERT(optype & XLOG_COMMIT_TRANS); 2565 2530 *commit_iclog = iclog; 2566 2531 } else { 2567 - error = xlog_state_release_iclog(log, iclog); 2532 + error = xlog_state_release_iclog(log, iclog, 0); 2568 2533 } 2569 2534 spin_unlock(&log->l_icloglock); 2570 2535 ··· 2602 2567 memset(iclog->ic_header.h_cycle_data, 0, 2603 2568 sizeof(iclog->ic_header.h_cycle_data)); 2604 2569 iclog->ic_header.h_lsn = 0; 2570 + iclog->ic_header.h_tail_lsn = 0; 2605 2571 } 2606 2572 2607 2573 /* ··· 3003 2967 * reference to the iclog. 3004 2968 */ 3005 2969 if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) 3006 - error = xlog_state_release_iclog(log, iclog); 2970 + error = xlog_state_release_iclog(log, iclog, 0); 3007 2971 spin_unlock(&log->l_icloglock); 3008 2972 if (error) 3009 2973 return error; ··· 3168 3132 } 3169 3133 3170 3134 /* 3135 + * Force the iclog to disk and check if the iclog has been completed before 3136 + * xlog_force_iclog() returns. This can happen on synchronous (e.g. 3137 + * pmem) or fast async storage because we drop the icloglock to issue the IO. 3138 + * If completion has already occurred, tell the caller so that it can avoid an 3139 + * unnecessary wait on the iclog. 
3140 + */ 3141 + static int 3142 + xlog_force_and_check_iclog( 3143 + struct xlog_in_core *iclog, 3144 + bool *completed) 3145 + { 3146 + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); 3147 + int error; 3148 + 3149 + *completed = false; 3150 + error = xlog_force_iclog(iclog); 3151 + if (error) 3152 + return error; 3153 + 3154 + /* 3155 + * If the iclog has already been completed and reused the header LSN 3156 + * will have been rewritten by completion 3157 + */ 3158 + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) 3159 + *completed = true; 3160 + return 0; 3161 + } 3162 + 3163 + /* 3171 3164 * Write out all data in the in-core log as of this exact moment in time. 3172 3165 * 3173 3166 * Data may be written to the in-core log during this call. However, ··· 3230 3165 { 3231 3166 struct xlog *log = mp->m_log; 3232 3167 struct xlog_in_core *iclog; 3233 - xfs_lsn_t lsn; 3234 3168 3235 3169 XFS_STATS_INC(mp, xs_log_force); 3236 3170 trace_xfs_log_force(mp, 0, _RET_IP_); ··· 3257 3193 iclog = iclog->ic_prev; 3258 3194 } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3259 3195 if (atomic_read(&iclog->ic_refcnt) == 0) { 3260 - /* 3261 - * We are the only one with access to this iclog. 3262 - * 3263 - * Flush it out now. There should be a roundoff of zero 3264 - * to show that someone has already taken care of the 3265 - * roundoff from the previous sync. 3266 - */ 3267 - atomic_inc(&iclog->ic_refcnt); 3268 - lsn = be64_to_cpu(iclog->ic_header.h_lsn); 3269 - xlog_state_switch_iclogs(log, iclog, 0); 3270 - if (xlog_state_release_iclog(log, iclog)) 3196 + /* We have exclusive access to this iclog. */ 3197 + bool completed; 3198 + 3199 + if (xlog_force_and_check_iclog(iclog, &completed)) 3271 3200 goto out_error; 3272 3201 3273 - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) 3202 + if (completed) 3274 3203 goto out_unlock; 3275 3204 } else { 3276 3205 /* 3277 - * Someone else is writing to this iclog. 3278 - * 3279 - * Use its call to flush out the data. 
However, the 3280 - * other thread may not force out this LR, so we mark 3281 - * it WANT_SYNC. 3206 + * Someone else is still writing to this iclog, so we 3207 + * need to ensure that when they release the iclog it 3208 + * gets synced immediately as we may be waiting on it. 3282 3209 */ 3283 3210 xlog_state_switch_iclogs(log, iclog, 0); 3284 3211 } 3285 - } else { 3286 - /* 3287 - * If the head iclog is not active nor dirty, we just attach 3288 - * ourselves to the head and go to sleep if necessary. 3289 - */ 3290 - ; 3291 3212 } 3213 + 3214 + /* 3215 + * The iclog we are about to wait on may contain the checkpoint pushed 3216 + * by the above xlog_cil_force() call, but it may not have been pushed 3217 + * to disk yet. Like the ACTIVE case above, we need to make sure caches 3218 + * are flushed when this iclog is written. 3219 + */ 3220 + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) 3221 + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; 3292 3222 3293 3223 if (flags & XFS_LOG_SYNC) 3294 3224 return xlog_wait_on_iclog(iclog); ··· 3303 3245 bool already_slept) 3304 3246 { 3305 3247 struct xlog_in_core *iclog; 3248 + bool completed; 3306 3249 3307 3250 spin_lock(&log->l_icloglock); 3308 3251 iclog = log->l_iclog; ··· 3317 3258 goto out_unlock; 3318 3259 } 3319 3260 3320 - if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3261 + switch (iclog->ic_state) { 3262 + case XLOG_STATE_ACTIVE: 3321 3263 /* 3322 3264 * We sleep here if we haven't already slept (e.g. 
this is the 3323 3265 * first time we've looked at the correct iclog buf) and the ··· 3341 3281 &log->l_icloglock); 3342 3282 return -EAGAIN; 3343 3283 } 3344 - atomic_inc(&iclog->ic_refcnt); 3345 - xlog_state_switch_iclogs(log, iclog, 0); 3346 - if (xlog_state_release_iclog(log, iclog)) 3284 + if (xlog_force_and_check_iclog(iclog, &completed)) 3347 3285 goto out_error; 3348 3286 if (log_flushed) 3349 3287 *log_flushed = 1; 3288 + if (completed) 3289 + goto out_unlock; 3290 + break; 3291 + case XLOG_STATE_WANT_SYNC: 3292 + /* 3293 + * This iclog may contain the checkpoint pushed by the 3294 + * xlog_cil_force_seq() call, but there are other writers still 3295 + * accessing it so it hasn't been pushed to disk yet. Like the 3296 + * ACTIVE case above, we need to make sure caches are flushed 3297 + * when this iclog is written. 3298 + */ 3299 + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; 3300 + break; 3301 + default: 3302 + /* 3303 + * The entire checkpoint was written by the CIL force and is on 3304 + * its way to disk already. It will be stable when it 3305 + * completes, so we don't need to manipulate caches here at all. 3306 + * We just need to wait for completion if necessary. 3307 + */ 3308 + break; 3350 3309 } 3351 3310 3352 3311 if (flags & XFS_LOG_SYNC) ··· 3638 3559 STATIC void 3639 3560 xlog_verify_tail_lsn( 3640 3561 struct xlog *log, 3641 - struct xlog_in_core *iclog, 3642 - xfs_lsn_t tail_lsn) 3562 + struct xlog_in_core *iclog) 3643 3563 { 3644 - int blocks; 3564 + xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn); 3565 + int blocks; 3645 3566 3646 3567 if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { 3647 3568 blocks =
+11 -2
fs/xfs/xfs_log_cil.c
··· 654 654 struct xfs_trans_header thdr; 655 655 struct xfs_log_iovec lhdr; 656 656 struct xfs_log_vec lvhdr = { NULL }; 657 + xfs_lsn_t preflush_tail_lsn; 657 658 xfs_lsn_t commit_lsn; 658 - xfs_lsn_t push_seq; 659 + xfs_csn_t push_seq; 659 660 struct bio bio; 660 661 DECLARE_COMPLETION_ONSTACK(bdev_flush); 661 662 ··· 731 730 * because we hold the flush lock exclusively. Hence we can now issue 732 731 * a cache flush to ensure all the completed metadata in the journal we 733 732 * are about to overwrite is on stable storage. 733 + * 734 + * Because we are issuing this cache flush before we've written the 735 + * tail lsn to the iclog, we can have metadata IO completions move the 736 + * tail forwards between the completion of this flush and the iclog 737 + * being written. In this case, we need to re-issue the cache flush 738 + * before the iclog write. To detect whether the log tail moves, sample 739 + * the tail LSN *before* we issue the flush. 734 740 */ 741 + preflush_tail_lsn = atomic64_read(&log->l_tail_lsn); 735 742 xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev, 736 743 &bdev_flush); 737 744 ··· 950 941 * storage. 951 942 */ 952 943 commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; 953 - xlog_state_release_iclog(log, commit_iclog); 944 + xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn); 954 945 spin_unlock(&log->l_icloglock); 955 946 return; 956 947
+12 -4
fs/xfs/xfs_log_priv.h
··· 59 59 { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }, \ 60 60 { XLOG_STATE_IOERROR, "XLOG_STATE_IOERROR" } 61 61 62 + /* 63 + * In core log flags 64 + */ 65 + #define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ 66 + #define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ 67 + 68 + #define XLOG_ICL_STRINGS \ 69 + { XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \ 70 + { XLOG_ICL_NEED_FUA, "XLOG_ICL_NEED_FUA" } 71 + 62 72 63 73 /* 64 74 * Log ticket flags ··· 152 142 #define XLOG_STATE_COVER_DONE2 4 153 143 154 144 #define XLOG_COVER_OPS 5 155 - 156 - #define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ 157 - #define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ 158 145 159 146 /* Ticket reservation region accounting */ 160 147 #define XLOG_TIC_LEN_MAX 15 ··· 504 497 void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); 505 498 void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); 506 499 507 - int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); 500 + int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, 501 + xfs_lsn_t log_tail_lsn); 508 502 509 503 /* 510 504 * When we crack an atomic LSN, we sample it first so that the value will not
+4 -1
fs/xfs/xfs_trace.h
··· 3944 3944 __field(uint32_t, state) 3945 3945 __field(int32_t, refcount) 3946 3946 __field(uint32_t, offset) 3947 + __field(uint32_t, flags) 3947 3948 __field(unsigned long long, lsn) 3948 3949 __field(unsigned long, caller_ip) 3949 3950 ), ··· 3953 3952 __entry->state = iclog->ic_state; 3954 3953 __entry->refcount = atomic_read(&iclog->ic_refcnt); 3955 3954 __entry->offset = iclog->ic_offset; 3955 + __entry->flags = iclog->ic_flags; 3956 3956 __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn); 3957 3957 __entry->caller_ip = caller_ip; 3958 3958 ), 3959 - TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx caller %pS", 3959 + TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS", 3960 3960 MAJOR(__entry->dev), MINOR(__entry->dev), 3961 3961 __print_symbolic(__entry->state, XLOG_STATE_STRINGS), 3962 3962 __entry->refcount, 3963 3963 __entry->offset, 3964 3964 __entry->lsn, 3965 + __print_flags(__entry->flags, "|", XLOG_ICL_STRINGS), 3965 3966 (char *)__entry->caller_ip) 3966 3967 3967 3968 );