fs/ext4/fast_commit.c at ed78aeebef05212ef7dca93bd931e4eff67c113f

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / fs / ext4 / fast_commit.c
at ed78aeebef05212ef7dca93bd931e4eff67c113f 2351 lines 69 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0
   2
   3/*
   4 * fs/ext4/fast_commit.c
   5 *
   6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7 *
   8 * Ext4 fast commits routines.
   9 */
  10#include "ext4.h"
  11#include "ext4_jbd2.h"
  12#include "ext4_extents.h"
  13#include "mballoc.h"
  14
  15#include <linux/lockdep.h>
  16/*
  17 * Ext4 Fast Commits
  18 * -----------------
  19 *
  20 * Ext4 fast commits implement fine grained journalling for Ext4.
  21 *
  22 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  23 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  24 * TLV during the recovery phase. For the scenarios for which we currently
  25 * don't have replay code, fast commit falls back to full commits.
  26 * Fast commits record delta in one of the following three categories.
  27 *
  28 * (A) Directory entry updates:
  29 *
  30 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
  31 * - EXT4_FC_TAG_LINK		- records directory entry link
  32 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
  33 *
  34 * (B) File specific data range updates:
  35 *
  36 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
  37 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
  38 *
  39 * (C) Inode metadata (mtime / ctime etc):
  40 *
  41 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
  42 *				  during recovery. Note that iblocks field is
  43 *				  not replayed and instead derived during
  44 *				  replay.
  45 * Commit Operation
  46 * ----------------
  47 * With fast commits, we maintain all the directory entry operations in the
  48 * order in which they are issued in an in-memory queue. This queue is flushed
  49 * to disk during the commit operation. We also maintain a list of inodes
  50 * that need to be committed during a fast commit in another in memory queue of
  51 * inodes. During the commit operation, we commit in the following order:
  52 *
  53 * [1] Prepare all the inodes to write out their data by setting
  54 *     "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
  55 *     deleted while it is being flushed.
  56 * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
  57 *     state.
  58 * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
 *     all the existing handles finish and no new handles can start.
  60 * [4] Mark all the fast commit eligible inodes as undergoing fast commit
  61 *     by setting "EXT4_STATE_FC_COMMITTING" state.
  62 * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
  63 *     starting of new handles. If new handles try to start an update on
  64 *     any of the inodes that are being committed, ext4_fc_track_inode()
  65 *     will block until those inodes have finished the fast commit.
  66 * [6] Commit all the directory entry updates in the fast commit space.
  67 * [7] Commit all the changed inodes in the fast commit space and clear
  68 *     "EXT4_STATE_FC_COMMITTING" for these inodes.
  69 * [8] Write tail tag (this tag ensures the atomicity, please read the following
  70 *     section for more details).
  71 *
 * All the inode updates must be enclosed within jbd2_journal_start()
  73 * and jbd2_journal_stop() similar to JBD2 journaling.
  74 *
  75 * Fast Commit Ineligibility
  76 * -------------------------
  77 *
  78 * Not all operations are supported by fast commits today (e.g extended
  79 * attributes). Fast commit ineligibility is marked by calling
  80 * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back
  81 * to full commit.
  82 *
  83 * Atomicity of commits
  84 * --------------------
  85 * In order to guarantee atomicity during the commit operation, fast commit
  86 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  87 * tag contains CRC of the contents and TID of the transaction after which
  88 * this fast commit should be applied. Recovery code replays fast commit
  89 * logs only if there's at least 1 valid tail present. For every fast commit
  90 * operation, there is 1 tail. This means, we may end up with multiple tails
  91 * in the fast commit space. Here's an example:
  92 *
  93 * - Create a new file A and remove existing file B
  94 * - fsync()
  95 * - Append contents to file A
  96 * - Truncate file A
  97 * - fsync()
  98 *
  99 * The fast commit space at the end of above operations would look like this:
 100 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 101 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 102 *
 103 * Replay code should thus check for all the valid tails in the FC area.
 104 *
 105 * Fast Commit Replay Idempotence
 106 * ------------------------------
 107 *
 108 * Fast commits tags are idempotent in nature provided the recovery code follows
 109 * certain rules. The guiding principle that the commit path follows while
 110 * committing is that it stores the result of a particular operation instead of
 111 * storing the procedure.
 112 *
 113 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 114 * was associated with inode 10. During fast commit, instead of storing this
 115 * operation as a procedure "rename a to b", we store the resulting file system
 116 * state as a "series" of outcomes:
 117 *
 118 * - Link dirent b to inode 10
 119 * - Unlink dirent a
 120 * - Inode <10> with valid refcount
 121 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 123 * system. This is what guarantees idempotence of fast commit replay.
 124 *
 125 * Let's take an example of a procedure that is not idempotent and see how fast
 126 * commits make it idempotent. Consider following sequence of operations:
 127 *
 128 *     rm A;    mv B A;    read A
 129 *  (x)     (y)        (z)
 130 *
 131 * (x), (y) and (z) are the points at which we can crash. If we store this
 132 * sequence of operations as is then the replay is not idempotent. Let's say
 133 * while in replay, we crash at (z). During the second replay, file A (which was
 134 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 135 * file named A would be absent when we try to read A. So, this sequence of
 136 * operations is not idempotent. However, as mentioned above, instead of storing
 137 * the procedure fast commits store the outcome of each procedure. Thus the fast
 138 * commit log for above procedure would be as follows:
 139 *
 140 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 141 * inode 11 before the replay)
 142 *
 143 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 144 * (w)          (x)                    (y)          (z)
 145 *
 146 * If we crash at (z), we will have file A linked to inode 11. During the second
 147 * replay, we will remove file A (inode 11). But we will create it back and make
 148 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 149 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 150 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 151 * similarly. Thus, by converting a non-idempotent procedure into a series of
 152 * idempotent outcomes, fast commits ensured idempotence during the replay.
 153 *
 154 * Locking
 155 * -------
 156 * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
 157 * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
 158 * inode. Most of the code avoids acquiring both the locks, but if one must do
 159 * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
 160 *
 161 * TODOs
 162 * -----
 163 *
 164 * 0) Fast commit replay path hardening: Fast commit replay code should use
 165 *    journal handles to make sure all the updates it does during the replay
 166 *    path are atomic. With that if we crash during fast commit replay, after
 167 *    trying to do recovery again, we will find a file system where fast commit
 168 *    area is invalid (because new full commit would be found). In order to deal
 169 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 170 *    superblock state is persisted before starting the replay, so that after
 171 *    the crash, fast commit recovery code can look at that flag and perform
 172 *    fast commit recovery even if that area is invalidated by later full
 173 *    commits.
 174 *
 175 * 1) Handle more ineligible cases.
 176 *
 177 * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent
 178 *    status tree. This would get rid of the need to call ext4_fc_track_inode()
 179 *    before acquiring i_data_sem. To do that we would need to ensure that
 180 *    modified extents from the extent status tree are not evicted from memory.
 181 */
 182
 183#include <trace/events/ext4.h>
 184static struct kmem_cache *ext4_fc_dentry_cachep;
 185
 186static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 187{
 188	BUFFER_TRACE(bh, "");
 189	if (uptodate) {
 190		ext4_debug("%s: Block %lld up-to-date",
 191			   __func__, bh->b_blocknr);
 192		set_buffer_uptodate(bh);
 193	} else {
 194		ext4_debug("%s: Block %lld not up-to-date",
 195			   __func__, bh->b_blocknr);
 196		clear_buffer_uptodate(bh);
 197	}
 198
 199	unlock_buffer(bh);
 200}
 201
 202static inline void ext4_fc_reset_inode(struct inode *inode)
 203{
 204	struct ext4_inode_info *ei = EXT4_I(inode);
 205
 206	ei->i_fc_lblk_start = 0;
 207	ei->i_fc_lblk_len = 0;
 208}
 209
 210void ext4_fc_init_inode(struct inode *inode)
 211{
 212	struct ext4_inode_info *ei = EXT4_I(inode);
 213
 214	ext4_fc_reset_inode(inode);
 215	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 216	INIT_LIST_HEAD(&ei->i_fc_list);
 217	INIT_LIST_HEAD(&ei->i_fc_dilist);
 218	init_waitqueue_head(&ei->i_fc_wait);
 219}
 220
 221static bool ext4_fc_disabled(struct super_block *sb)
 222{
 223	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 224		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
 225}
 226
 227/*
 228 * Remove inode from fast commit list. If the inode is being committed
 229 * we wait until inode commit is done.
 230 */
 231void ext4_fc_del(struct inode *inode)
 232{
 233	struct ext4_inode_info *ei = EXT4_I(inode);
 234	struct ext4_fc_dentry_update *fc_dentry;
 235	wait_queue_head_t *wq;
 236	int alloc_ctx;
 237
 238	if (ext4_fc_disabled(inode->i_sb))
 239		return;
 240
 241	alloc_ctx = ext4_fc_lock(inode->i_sb);
 242	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
 243		ext4_fc_unlock(inode->i_sb, alloc_ctx);
 244		return;
 245	}
 246
 247	/*
 248	 * Since ext4_fc_del is called from ext4_evict_inode while having a
 249	 * handle open, there is no need for us to wait here even if a fast
 250	 * commit is going on. That is because, if this inode is being
 251	 * committed, ext4_mark_inode_dirty would have waited for inode commit
 252	 * operation to finish before we come here. So, by the time we come
 253	 * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
 254	 * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
 255	 * here.
 256	 *
 257	 * We may come here without any handles open in the "no_delete" case of
 258	 * ext4_evict_inode as well. However, if that happens, we first mark the
 259	 * file system as fast commit ineligible anyway. So, even in that case,
 260	 * it is okay to remove the inode from the fc list.
 261	 */
 262	WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
 263		&& !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
 264	while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
 265#if (BITS_PER_LONG < 64)
 266		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 267				EXT4_STATE_FC_FLUSHING_DATA);
 268		wq = bit_waitqueue(&ei->i_state_flags,
 269				   EXT4_STATE_FC_FLUSHING_DATA);
 270#else
 271		DEFINE_WAIT_BIT(wait, &ei->i_flags,
 272				EXT4_STATE_FC_FLUSHING_DATA);
 273		wq = bit_waitqueue(&ei->i_flags,
 274				   EXT4_STATE_FC_FLUSHING_DATA);
 275#endif
 276		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 277		if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
 278			ext4_fc_unlock(inode->i_sb, alloc_ctx);
 279			schedule();
 280			alloc_ctx = ext4_fc_lock(inode->i_sb);
 281		}
 282		finish_wait(wq, &wait.wq_entry);
 283	}
 284	list_del_init(&ei->i_fc_list);
 285
 286	/*
 287	 * Since this inode is getting removed, let's also remove all FC
 288	 * dentry create references, since it is not needed to log it anyways.
 289	 */
 290	if (list_empty(&ei->i_fc_dilist)) {
 291		ext4_fc_unlock(inode->i_sb, alloc_ctx);
 292		return;
 293	}
 294
 295	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
 296	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
 297	list_del_init(&fc_dentry->fcd_list);
 298	list_del_init(&fc_dentry->fcd_dilist);
 299
 300	WARN_ON(!list_empty(&ei->i_fc_dilist));
 301	ext4_fc_unlock(inode->i_sb, alloc_ctx);
 302
 303	release_dentry_name_snapshot(&fc_dentry->fcd_name);
 304	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
 305}
 306
 307/*
 308 * Mark file system as fast commit ineligible, and record latest
 309 * ineligible transaction tid. This means until the recorded
 310 * transaction, commit operation would result in a full jbd2 commit.
 311 */
 312void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
 313{
 314	struct ext4_sb_info *sbi = EXT4_SB(sb);
 315	tid_t tid;
 316	bool has_transaction = true;
 317	bool is_ineligible;
 318	int alloc_ctx;
 319
 320	if (ext4_fc_disabled(sb))
 321		return;
 322
 323	if (handle && !IS_ERR(handle))
 324		tid = handle->h_transaction->t_tid;
 325	else {
 326		read_lock(&sbi->s_journal->j_state_lock);
 327		if (sbi->s_journal->j_running_transaction)
 328			tid = sbi->s_journal->j_running_transaction->t_tid;
 329		else
 330			has_transaction = false;
 331		read_unlock(&sbi->s_journal->j_state_lock);
 332	}
 333	alloc_ctx = ext4_fc_lock(sb);
 334	is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 335	if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
 336		sbi->s_fc_ineligible_tid = tid;
 337	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 338	ext4_fc_unlock(sb, alloc_ctx);
 339	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 340	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 341}
 342
 343/*
 344 * Generic fast commit tracking function. If this is the first time this we are
 345 * called after a full commit, we initialize fast commit fields and then call
 346 * __fc_track_fn() with update = 0. If we have already been called after a full
 347 * commit, we pass update = 1. Based on that, the track function can determine
 348 * if it needs to track a field for the first time or if it needs to just
 349 * update the previously tracked value.
 350 *
 351 * If enqueue is set, this function enqueues the inode in fast commit list.
 352 */
 353static int ext4_fc_track_template(
 354	handle_t *handle, struct inode *inode,
 355	int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
 356	void *args, int enqueue)
 357{
 358	bool update = false;
 359	struct ext4_inode_info *ei = EXT4_I(inode);
 360	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 361	tid_t tid = 0;
 362	int alloc_ctx;
 363	int ret;
 364
 365	tid = handle->h_transaction->t_tid;
 366	spin_lock(&ei->i_fc_lock);
 367	if (tid == ei->i_sync_tid) {
 368		update = true;
 369	} else {
 370		ext4_fc_reset_inode(inode);
 371		ei->i_sync_tid = tid;
 372	}
 373	ret = __fc_track_fn(handle, inode, args, update);
 374	spin_unlock(&ei->i_fc_lock);
 375	if (!enqueue)
 376		return ret;
 377
 378	alloc_ctx = ext4_fc_lock(inode->i_sb);
 379	if (list_empty(&EXT4_I(inode)->i_fc_list))
 380		list_add_tail(&EXT4_I(inode)->i_fc_list,
 381				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
 382				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
 383				&sbi->s_fc_q[FC_Q_STAGING] :
 384				&sbi->s_fc_q[FC_Q_MAIN]);
 385	ext4_fc_unlock(inode->i_sb, alloc_ctx);
 386
 387	return ret;
 388}
 389
 390struct __track_dentry_update_args {
 391	struct dentry *dentry;
 392	int op;
 393};
 394
 395/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 396static int __track_dentry_update(handle_t *handle, struct inode *inode,
 397				 void *arg, bool update)
 398{
 399	struct ext4_fc_dentry_update *node;
 400	struct ext4_inode_info *ei = EXT4_I(inode);
 401	struct __track_dentry_update_args *dentry_update =
 402		(struct __track_dentry_update_args *)arg;
 403	struct dentry *dentry = dentry_update->dentry;
 404	struct inode *dir = dentry->d_parent->d_inode;
 405	struct super_block *sb = inode->i_sb;
 406	struct ext4_sb_info *sbi = EXT4_SB(sb);
 407	int alloc_ctx;
 408
 409	spin_unlock(&ei->i_fc_lock);
 410
 411	if (IS_ENCRYPTED(dir)) {
 412		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
 413					handle);
 414		spin_lock(&ei->i_fc_lock);
 415		return -EOPNOTSUPP;
 416	}
 417
 418	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 419	if (!node) {
 420		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
 421		spin_lock(&ei->i_fc_lock);
 422		return -ENOMEM;
 423	}
 424
 425	node->fcd_op = dentry_update->op;
 426	node->fcd_parent = dir->i_ino;
 427	node->fcd_ino = inode->i_ino;
 428	take_dentry_name_snapshot(&node->fcd_name, dentry);
 429	INIT_LIST_HEAD(&node->fcd_dilist);
 430	INIT_LIST_HEAD(&node->fcd_list);
 431	alloc_ctx = ext4_fc_lock(sb);
 432	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
 433		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
 434		list_add_tail(&node->fcd_list,
 435				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
 436	else
 437		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 438
 439	/*
 440	 * This helps us keep a track of all fc_dentry updates which is part of
 441	 * this ext4 inode. So in case the inode is getting unlinked, before
 442	 * even we get a chance to fsync, we could remove all fc_dentry
 443	 * references while evicting the inode in ext4_fc_del().
 444	 * Also with this, we don't need to loop over all the inodes in
 445	 * sbi->s_fc_q to get the corresponding inode in
 446	 * ext4_fc_commit_dentry_updates().
 447	 */
 448	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
 449		WARN_ON(!list_empty(&ei->i_fc_dilist));
 450		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
 451	}
 452	ext4_fc_unlock(sb, alloc_ctx);
 453	spin_lock(&ei->i_fc_lock);
 454
 455	return 0;
 456}
 457
 458void __ext4_fc_track_unlink(handle_t *handle,
 459		struct inode *inode, struct dentry *dentry)
 460{
 461	struct __track_dentry_update_args args;
 462	int ret;
 463
 464	args.dentry = dentry;
 465	args.op = EXT4_FC_TAG_UNLINK;
 466
 467	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 468					(void *)&args, 0);
 469	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
 470}
 471
 472void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 473{
 474	struct inode *inode = d_inode(dentry);
 475
 476	if (ext4_fc_disabled(inode->i_sb))
 477		return;
 478
 479	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 480		return;
 481
 482	__ext4_fc_track_unlink(handle, inode, dentry);
 483}
 484
 485void __ext4_fc_track_link(handle_t *handle,
 486	struct inode *inode, struct dentry *dentry)
 487{
 488	struct __track_dentry_update_args args;
 489	int ret;
 490
 491	args.dentry = dentry;
 492	args.op = EXT4_FC_TAG_LINK;
 493
 494	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 495					(void *)&args, 0);
 496	trace_ext4_fc_track_link(handle, inode, dentry, ret);
 497}
 498
 499void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 500{
 501	struct inode *inode = d_inode(dentry);
 502
 503	if (ext4_fc_disabled(inode->i_sb))
 504		return;
 505
 506	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 507		return;
 508
 509	__ext4_fc_track_link(handle, inode, dentry);
 510}
 511
 512void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 513			  struct dentry *dentry)
 514{
 515	struct __track_dentry_update_args args;
 516	int ret;
 517
 518	args.dentry = dentry;
 519	args.op = EXT4_FC_TAG_CREAT;
 520
 521	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 522					(void *)&args, 0);
 523	trace_ext4_fc_track_create(handle, inode, dentry, ret);
 524}
 525
 526void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 527{
 528	struct inode *inode = d_inode(dentry);
 529
 530	if (ext4_fc_disabled(inode->i_sb))
 531		return;
 532
 533	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 534		return;
 535
 536	__ext4_fc_track_create(handle, inode, dentry);
 537}
 538
 539/* __track_fn for inode tracking */
 540static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
 541			 bool update)
 542{
 543	if (update)
 544		return -EEXIST;
 545
 546	EXT4_I(inode)->i_fc_lblk_len = 0;
 547
 548	return 0;
 549}
 550
 551void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 552{
 553	struct ext4_inode_info *ei = EXT4_I(inode);
 554	wait_queue_head_t *wq;
 555	int ret;
 556
 557	if (S_ISDIR(inode->i_mode))
 558		return;
 559
 560	if (ext4_fc_disabled(inode->i_sb))
 561		return;
 562
 563	if (ext4_should_journal_data(inode)) {
 564		ext4_fc_mark_ineligible(inode->i_sb,
 565					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
 566		return;
 567	}
 568
 569	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 570		return;
 571
 572	/*
 573	 * If we come here, we may sleep while waiting for the inode to
 574	 * commit. We shouldn't be holding i_data_sem when we go to sleep since
 575	 * the commit path needs to grab the lock while committing the inode.
 576	 */
 577	lockdep_assert_not_held(&ei->i_data_sem);
 578
 579	while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 580#if (BITS_PER_LONG < 64)
 581		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 582				EXT4_STATE_FC_COMMITTING);
 583		wq = bit_waitqueue(&ei->i_state_flags,
 584				   EXT4_STATE_FC_COMMITTING);
 585#else
 586		DEFINE_WAIT_BIT(wait, &ei->i_flags,
 587				EXT4_STATE_FC_COMMITTING);
 588		wq = bit_waitqueue(&ei->i_flags,
 589				   EXT4_STATE_FC_COMMITTING);
 590#endif
 591		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 592		if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
 593			schedule();
 594		finish_wait(wq, &wait.wq_entry);
 595	}
 596
 597	/*
 598	 * From this point on, this inode will not be committed either
 599	 * by fast or full commit as long as the handle is open.
 600	 */
 601	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 602	trace_ext4_fc_track_inode(handle, inode, ret);
 603}
 604
 605struct __track_range_args {
 606	ext4_lblk_t start, end;
 607};
 608
 609/* __track_fn for tracking data updates */
 610static int __track_range(handle_t *handle, struct inode *inode, void *arg,
 611			 bool update)
 612{
 613	struct ext4_inode_info *ei = EXT4_I(inode);
 614	ext4_lblk_t oldstart;
 615	struct __track_range_args *__arg =
 616		(struct __track_range_args *)arg;
 617
 618	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 619		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 620		return -ECANCELED;
 621	}
 622
 623	oldstart = ei->i_fc_lblk_start;
 624
 625	if (update && ei->i_fc_lblk_len > 0) {
 626		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 627		ei->i_fc_lblk_len =
 628			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 629				ei->i_fc_lblk_start + 1;
 630	} else {
 631		ei->i_fc_lblk_start = __arg->start;
 632		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 633	}
 634
 635	return 0;
 636}
 637
 638void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 639			 ext4_lblk_t end)
 640{
 641	struct __track_range_args args;
 642	int ret;
 643
 644	if (S_ISDIR(inode->i_mode))
 645		return;
 646
 647	if (ext4_fc_disabled(inode->i_sb))
 648		return;
 649
 650	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 651		return;
 652
 653	if (ext4_has_inline_data(inode)) {
 654		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
 655					handle);
 656		return;
 657	}
 658
 659	args.start = start;
 660	args.end = end;
 661
 662	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 663
 664	trace_ext4_fc_track_range(handle, inode, start, end, ret);
 665}
 666
 667static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 668{
 669	blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS;
 670	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 671
 672	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
 673	if (test_opt(sb, BARRIER) && is_tail)
 674		write_flags |= REQ_FUA | REQ_PREFLUSH;
 675	lock_buffer(bh);
 676	set_buffer_dirty(bh);
 677	set_buffer_uptodate(bh);
 678	bh->b_end_io = ext4_end_buffer_io_sync;
 679	submit_bh(REQ_OP_WRITE | write_flags, bh);
 680	EXT4_SB(sb)->s_fc_bh = NULL;
 681}
 682
 683/* Ext4 commit path routines */
 684
 685/*
 686 * Allocate len bytes on a fast commit buffer.
 687 *
 688 * During the commit time this function is used to manage fast commit
 689 * block space. We don't split a fast commit log onto different
 690 * blocks. So this function makes sure that if there's not enough space
 691 * on the current block, the remaining space in the current block is
 692 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 693 * new block is from jbd2 and CRC is updated to reflect the padding
 694 * we added.
 695 */
 696static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 697{
 698	struct ext4_fc_tl tl;
 699	struct ext4_sb_info *sbi = EXT4_SB(sb);
 700	struct buffer_head *bh;
 701	int bsize = sbi->s_journal->j_blocksize;
 702	int ret, off = sbi->s_fc_bytes % bsize;
 703	int remaining;
 704	u8 *dst;
 705
 706	/*
 707	 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
 708	 * cannot fulfill the request.
 709	 */
 710	if (len > bsize - EXT4_FC_TAG_BASE_LEN)
 711		return NULL;
 712
 713	if (!sbi->s_fc_bh) {
 714		ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 715		if (ret)
 716			return NULL;
 717		sbi->s_fc_bh = bh;
 718	}
 719	dst = sbi->s_fc_bh->b_data + off;
 720
 721	/*
 722	 * Allocate the bytes in the current block if we can do so while still
 723	 * leaving enough space for a PAD tlv.
 724	 */
 725	remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
 726	if (len <= remaining) {
 727		sbi->s_fc_bytes += len;
 728		return dst;
 729	}
 730
 731	/*
 732	 * Else, terminate the current block with a PAD tlv, then allocate a new
 733	 * block and allocate the bytes at the start of that new block.
 734	 */
 735
 736	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 737	tl.fc_len = cpu_to_le16(remaining);
 738	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 739	memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
 740	*crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize);
 741
 742	ext4_fc_submit_bh(sb, false);
 743
 744	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 745	if (ret)
 746		return NULL;
 747	sbi->s_fc_bh = bh;
 748	sbi->s_fc_bytes += bsize - off + len;
 749	return sbi->s_fc_bh->b_data;
 750}
 751
 752/*
 753 * Complete a fast commit by writing tail tag.
 754 *
 755 * Writing tail tag marks the end of a fast commit. In order to guarantee
 756 * atomicity, after writing tail tag, even if there's space remaining
 757 * in the block, next commit shouldn't use it. That's why tail tag
 758 * has the length as that of the remaining space on the block.
 759 */
 760static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 761{
 762	struct ext4_sb_info *sbi = EXT4_SB(sb);
 763	struct ext4_fc_tl tl;
 764	struct ext4_fc_tail tail;
 765	int off, bsize = sbi->s_journal->j_blocksize;
 766	u8 *dst;
 767
 768	/*
 769	 * ext4_fc_reserve_space takes care of allocating an extra block if
 770	 * there's no enough space on this block for accommodating this tail.
 771	 */
 772	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
 773	if (!dst)
 774		return -ENOSPC;
 775
 776	off = sbi->s_fc_bytes % bsize;
 777
 778	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 779	tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
 780	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 781
 782	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 783	dst += EXT4_FC_TAG_BASE_LEN;
 784	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 785	memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
 786	dst += sizeof(tail.fc_tid);
 787	crc = ext4_chksum(crc, sbi->s_fc_bh->b_data,
 788			  dst - (u8 *)sbi->s_fc_bh->b_data);
 789	tail.fc_crc = cpu_to_le32(crc);
 790	memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
 791	dst += sizeof(tail.fc_crc);
 792	memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
 793
 794	ext4_fc_submit_bh(sb, true);
 795
 796	return 0;
 797}
 798
 799/*
 800 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 801 * Returns false if there's not enough space.
 802 */
 803static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 804			   u32 *crc)
 805{
 806	struct ext4_fc_tl tl;
 807	u8 *dst;
 808
 809	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
 810	if (!dst)
 811		return false;
 812
 813	tl.fc_tag = cpu_to_le16(tag);
 814	tl.fc_len = cpu_to_le16(len);
 815
 816	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 817	memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
 818
 819	return true;
 820}
 821
 822/* Same as above, but adds dentry tlv. */
 823static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 824				   struct ext4_fc_dentry_update *fc_dentry)
 825{
 826	struct ext4_fc_dentry_info fcd;
 827	struct ext4_fc_tl tl;
 828	int dlen = fc_dentry->fcd_name.name.len;
 829	u8 *dst = ext4_fc_reserve_space(sb,
 830			EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
 831
 832	if (!dst)
 833		return false;
 834
 835	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
 836	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
 837	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
 838	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 839	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 840	dst += EXT4_FC_TAG_BASE_LEN;
 841	memcpy(dst, &fcd, sizeof(fcd));
 842	dst += sizeof(fcd);
 843	memcpy(dst, fc_dentry->fcd_name.name.name, dlen);
 844
 845	return true;
 846}
 847
 848/*
 849 * Writes inode in the fast commit space under TLV with tag @tag.
 850 * Returns 0 on success, error on failure.
 851 */
 852static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 853{
 854	struct ext4_inode_info *ei = EXT4_I(inode);
 855	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 856	int ret;
 857	struct ext4_iloc iloc;
 858	struct ext4_fc_inode fc_inode;
 859	struct ext4_fc_tl tl;
 860	u8 *dst;
 861
 862	ret = ext4_get_inode_loc(inode, &iloc);
 863	if (ret)
 864		return ret;
 865
 866	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
 867		inode_len = EXT4_INODE_SIZE(inode->i_sb);
 868	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 869		inode_len += ei->i_extra_isize;
 870
 871	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 872	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 873	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 874
 875	ret = -ECANCELED;
 876	dst = ext4_fc_reserve_space(inode->i_sb,
 877		EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
 878	if (!dst)
 879		goto err;
 880
 881	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 882	dst += EXT4_FC_TAG_BASE_LEN;
 883	memcpy(dst, &fc_inode, sizeof(fc_inode));
 884	dst += sizeof(fc_inode);
 885	memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
 886	ret = 0;
 887err:
 888	brelse(iloc.bh);
 889	return ret;
 890}
 891
 892/*
 893 * Writes updated data ranges for the inode in question. Updates CRC.
 894 * Returns 0 on success, error otherwise.
 895 */
 896static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 897{
 898	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 899	struct ext4_inode_info *ei = EXT4_I(inode);
 900	struct ext4_map_blocks map;
 901	struct ext4_fc_add_range fc_ext;
 902	struct ext4_fc_del_range lrange;
 903	struct ext4_extent *ex;
 904	int ret;
 905
 906	spin_lock(&ei->i_fc_lock);
 907	if (ei->i_fc_lblk_len == 0) {
 908		spin_unlock(&ei->i_fc_lock);
 909		return 0;
 910	}
 911	old_blk_size = ei->i_fc_lblk_start;
 912	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 913	ei->i_fc_lblk_len = 0;
 914	spin_unlock(&ei->i_fc_lock);
 915
 916	cur_lblk_off = old_blk_size;
 917	ext4_debug("will try writing %d to %d for inode %ld\n",
 918		   cur_lblk_off, new_blk_size, inode->i_ino);
 919
 920	while (cur_lblk_off <= new_blk_size) {
 921		map.m_lblk = cur_lblk_off;
 922		map.m_len = new_blk_size - cur_lblk_off + 1;
 923		ret = ext4_map_blocks(NULL, inode, &map,
 924				      EXT4_GET_BLOCKS_IO_SUBMIT |
 925				      EXT4_EX_NOCACHE);
 926		if (ret < 0)
 927			return -ECANCELED;
 928
 929		if (map.m_len == 0) {
 930			cur_lblk_off++;
 931			continue;
 932		}
 933
 934		if (ret == 0) {
 935			lrange.fc_ino = cpu_to_le32(inode->i_ino);
 936			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 937			lrange.fc_len = cpu_to_le32(map.m_len);
 938			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 939					    sizeof(lrange), (u8 *)&lrange, crc))
 940				return -ENOSPC;
 941		} else {
 942			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 943				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
 944
 945			/* Limit the number of blocks in one extent */
 946			map.m_len = min(max, map.m_len);
 947
 948			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 949			ex = (struct ext4_extent *)&fc_ext.fc_ex;
 950			ex->ee_block = cpu_to_le32(map.m_lblk);
 951			ex->ee_len = cpu_to_le16(map.m_len);
 952			ext4_ext_store_pblock(ex, map.m_pblk);
 953			if (map.m_flags & EXT4_MAP_UNWRITTEN)
 954				ext4_ext_mark_unwritten(ex);
 955			else
 956				ext4_ext_mark_initialized(ex);
 957			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 958					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
 959				return -ENOSPC;
 960		}
 961
 962		cur_lblk_off += map.m_len;
 963	}
 964
 965	return 0;
 966}
 967
 968
 969/* Flushes data of all the inodes in the commit queue. */
 970static int ext4_fc_flush_data(journal_t *journal)
 971{
 972	struct super_block *sb = journal->j_private;
 973	struct ext4_sb_info *sbi = EXT4_SB(sb);
 974	struct ext4_inode_info *ei;
 975	int ret = 0;
 976
 977	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 978		ret = jbd2_submit_inode_data(journal, ei->jinode);
 979		if (ret)
 980			return ret;
 981	}
 982
 983	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 984		ret = jbd2_wait_inode_data(journal, ei->jinode);
 985		if (ret)
 986			return ret;
 987	}
 988
 989	return 0;
 990}
 991
 992/* Commit all the directory entry updates */
 993static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 994{
 995	struct super_block *sb = journal->j_private;
 996	struct ext4_sb_info *sbi = EXT4_SB(sb);
 997	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
 998	struct inode *inode;
 999	struct ext4_inode_info *ei;
1000	int ret;
1001
1002	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1003		return 0;
1004	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1005				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1006		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1007			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
1008				return -ENOSPC;
1009			continue;
1010		}
1011		/*
1012		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1013		 * corresponding inode. Also, the corresponding inode could have been
1014		 * deleted, in which case, we don't need to do anything.
1015		 */
1016		if (list_empty(&fc_dentry->fcd_dilist))
1017			continue;
1018		ei = list_first_entry(&fc_dentry->fcd_dilist,
1019				struct ext4_inode_info, i_fc_dilist);
1020		inode = &ei->vfs_inode;
1021		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1022
1023		/*
1024		 * We first write the inode and then the create dirent. This
1025		 * allows the recovery code to create an unnamed inode first
1026		 * and then link it to a directory entry. This allows us
1027		 * to use namei.c routines almost as is and simplifies
1028		 * the recovery code.
1029		 */
1030		ret = ext4_fc_write_inode(inode, crc);
1031		if (ret)
1032			return ret;
1033		ret = ext4_fc_write_inode_data(inode, crc);
1034		if (ret)
1035			return ret;
1036		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
1037			return -ENOSPC;
1038	}
1039	return 0;
1040}
1041
1042static int ext4_fc_perform_commit(journal_t *journal)
1043{
1044	struct super_block *sb = journal->j_private;
1045	struct ext4_sb_info *sbi = EXT4_SB(sb);
1046	struct ext4_inode_info *iter;
1047	struct ext4_fc_head head;
1048	struct inode *inode;
1049	struct blk_plug plug;
1050	int ret = 0;
1051	u32 crc = 0;
1052	int alloc_ctx;
1053
1054	/*
1055	 * Step 1: Mark all inodes on s_fc_q[MAIN] with
1056	 * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
1057	 * freed until the data flush is over.
1058	 */
1059	alloc_ctx = ext4_fc_lock(sb);
1060	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1061		ext4_set_inode_state(&iter->vfs_inode,
1062				     EXT4_STATE_FC_FLUSHING_DATA);
1063	}
1064	ext4_fc_unlock(sb, alloc_ctx);
1065
1066	/* Step 2: Flush data for all the eligible inodes. */
1067	ret = ext4_fc_flush_data(journal);
1068
1069	/*
1070	 * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning
1071	 * any error from step 2. This ensures that waiters waiting on
1072	 * EXT4_STATE_FC_FLUSHING_DATA can resume.
1073	 */
1074	alloc_ctx = ext4_fc_lock(sb);
1075	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1076		ext4_clear_inode_state(&iter->vfs_inode,
1077				       EXT4_STATE_FC_FLUSHING_DATA);
1078#if (BITS_PER_LONG < 64)
1079		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
1080#else
1081		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
1082#endif
1083	}
1084
1085	/*
1086	 * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before
1087	 * the waiter checks the bit. Pairs with implicit barrier in
1088	 * prepare_to_wait() in ext4_fc_del().
1089	 */
1090	smp_mb();
1091	ext4_fc_unlock(sb, alloc_ctx);
1092
1093	/*
1094	 * If we encountered error in Step 2, return it now after clearing
1095	 * EXT4_STATE_FC_FLUSHING_DATA bit.
1096	 */
1097	if (ret)
1098		return ret;
1099
1100
1101	/* Step 4: Mark all inodes as being committed. */
1102	jbd2_journal_lock_updates(journal);
1103	/*
1104	 * The journal is now locked. No more handles can start and all the
1105	 * previous handles are now drained. We now mark the inodes on the
1106	 * commit queue as being committed.
1107	 */
1108	alloc_ctx = ext4_fc_lock(sb);
1109	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1110		ext4_set_inode_state(&iter->vfs_inode,
1111				     EXT4_STATE_FC_COMMITTING);
1112	}
1113	ext4_fc_unlock(sb, alloc_ctx);
1114	jbd2_journal_unlock_updates(journal);
1115
1116	/*
1117	 * Step 5: If file system device is different from journal device,
1118	 * issue a cache flush before we start writing fast commit blocks.
1119	 */
1120	if (journal->j_fs_dev != journal->j_dev)
1121		blkdev_issue_flush(journal->j_fs_dev);
1122
1123	blk_start_plug(&plug);
1124	alloc_ctx = ext4_fc_lock(sb);
1125	/* Step 6: Write fast commit blocks to disk. */
1126	if (sbi->s_fc_bytes == 0) {
1127		/*
1128		 * Step 6.1: Add a head tag only if this is the first fast
1129		 * commit in this TID.
1130		 */
1131		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1132		head.fc_tid = cpu_to_le32(
1133			sbi->s_journal->j_running_transaction->t_tid);
1134		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1135			(u8 *)&head, &crc)) {
1136			ret = -ENOSPC;
1137			goto out;
1138		}
1139	}
1140
1141	/* Step 6.2: Now write all the dentry updates. */
1142	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1143	if (ret)
1144		goto out;
1145
1146	/* Step 6.3: Now write all the changed inodes to disk. */
1147	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1148		inode = &iter->vfs_inode;
1149		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1150			continue;
1151
1152		ret = ext4_fc_write_inode_data(inode, &crc);
1153		if (ret)
1154			goto out;
1155		ret = ext4_fc_write_inode(inode, &crc);
1156		if (ret)
1157			goto out;
1158	}
1159	/* Step 6.4: Finally write tail tag to conclude this fast commit. */
1160	ret = ext4_fc_write_tail(sb, crc);
1161
1162out:
1163	ext4_fc_unlock(sb, alloc_ctx);
1164	blk_finish_plug(&plug);
1165	return ret;
1166}
1167
1168static void ext4_fc_update_stats(struct super_block *sb, int status,
1169				 u64 commit_time, int nblks, tid_t commit_tid)
1170{
1171	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1172
1173	ext4_debug("Fast commit ended with status = %d for tid %u",
1174			status, commit_tid);
1175	if (status == EXT4_FC_STATUS_OK) {
1176		stats->fc_num_commits++;
1177		stats->fc_numblks += nblks;
1178		if (likely(stats->s_fc_avg_commit_time))
1179			stats->s_fc_avg_commit_time =
1180				(commit_time +
1181				 stats->s_fc_avg_commit_time * 3) / 4;
1182		else
1183			stats->s_fc_avg_commit_time = commit_time;
1184	} else if (status == EXT4_FC_STATUS_FAILED ||
1185		   status == EXT4_FC_STATUS_INELIGIBLE) {
1186		if (status == EXT4_FC_STATUS_FAILED)
1187			stats->fc_failed_commits++;
1188		stats->fc_ineligible_commits++;
1189	} else {
1190		stats->fc_skipped_commits++;
1191	}
1192	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1193}
1194
1195/*
1196 * The main commit entry point. Performs a fast commit for transaction
1197 * commit_tid if needed. If it's not possible to perform a fast commit
1198 * due to various reasons, we fall back to full commit. Returns 0
1199 * on success, error otherwise.
1200 */
1201int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1202{
1203	struct super_block *sb = journal->j_private;
1204	struct ext4_sb_info *sbi = EXT4_SB(sb);
1205	int nblks = 0, ret, bsize = journal->j_blocksize;
1206	int subtid = atomic_read(&sbi->s_fc_subtid);
1207	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1208	ktime_t start_time, commit_time;
1209	int old_ioprio, journal_ioprio;
1210
1211	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1212		return jbd2_complete_transaction(journal, commit_tid);
1213
1214	trace_ext4_fc_commit_start(sb, commit_tid);
1215
1216	start_time = ktime_get();
1217	old_ioprio = get_current_ioprio();
1218
1219restart_fc:
1220	ret = jbd2_fc_begin_commit(journal, commit_tid);
1221	if (ret == -EALREADY) {
1222		/* There was an ongoing commit, check if we need to restart */
1223		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1224		    tid_gt(commit_tid, journal->j_commit_sequence))
1225			goto restart_fc;
1226		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
1227				commit_tid);
1228		return 0;
1229	} else if (ret) {
1230		/*
1231		 * Commit couldn't start. Just update stats and perform a
1232		 * full commit.
1233		 */
1234		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
1235				commit_tid);
1236		return jbd2_complete_transaction(journal, commit_tid);
1237	}
1238
1239	/*
1240	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
1241	 * if we are fast commit ineligible.
1242	 */
1243	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1244		status = EXT4_FC_STATUS_INELIGIBLE;
1245		goto fallback;
1246	}
1247
1248	/*
1249	 * Now that we know that this thread is going to do a fast commit,
1250	 * elevate the priority to match that of the journal thread.
1251	 */
1252	if (journal->j_task->io_context)
1253		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
1254	else
1255		journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
1256	set_task_ioprio(current, journal_ioprio);
1257	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1258	ret = ext4_fc_perform_commit(journal);
1259	if (ret < 0) {
1260		status = EXT4_FC_STATUS_FAILED;
1261		goto fallback;
1262	}
1263	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1264	ret = jbd2_fc_wait_bufs(journal, nblks);
1265	if (ret < 0) {
1266		status = EXT4_FC_STATUS_FAILED;
1267		goto fallback;
1268	}
1269	atomic_inc(&sbi->s_fc_subtid);
1270	ret = jbd2_fc_end_commit(journal);
1271	set_task_ioprio(current, old_ioprio);
1272	/*
1273	 * weight the commit time higher than the average time so we
1274	 * don't react too strongly to vast changes in the commit time
1275	 */
1276	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1277	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
1278	return ret;
1279
1280fallback:
1281	set_task_ioprio(current, old_ioprio);
1282	ret = jbd2_fc_end_commit_fallback(journal);
1283	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
1284	return ret;
1285}
1286
1287/*
1288 * Fast commit cleanup routine. This is called after every fast commit and
1289 * full commit. full is true if we are called after a full commit.
1290 */
1291static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1292{
1293	struct super_block *sb = journal->j_private;
1294	struct ext4_sb_info *sbi = EXT4_SB(sb);
1295	struct ext4_inode_info *ei;
1296	struct ext4_fc_dentry_update *fc_dentry;
1297	int alloc_ctx;
1298
1299	if (full && sbi->s_fc_bh)
1300		sbi->s_fc_bh = NULL;
1301
1302	trace_ext4_fc_cleanup(journal, full, tid);
1303	jbd2_fc_release_bufs(journal);
1304
1305	alloc_ctx = ext4_fc_lock(sb);
1306	while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
1307		ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
1308					struct ext4_inode_info,
1309					i_fc_list);
1310		list_del_init(&ei->i_fc_list);
1311		ext4_clear_inode_state(&ei->vfs_inode,
1312				       EXT4_STATE_FC_COMMITTING);
1313		if (tid_geq(tid, ei->i_sync_tid)) {
1314			ext4_fc_reset_inode(&ei->vfs_inode);
1315		} else if (full) {
1316			/*
1317			 * We are called after a full commit, inode has been
1318			 * modified while the commit was running. Re-enqueue
1319			 * the inode into STAGING, which will then be splice
1320			 * back into MAIN. This cannot happen during
1321			 * fastcommit because the journal is locked all the
1322			 * time in that case (and tid doesn't increase so
1323			 * tid check above isn't reliable).
1324			 */
1325			list_add_tail(&ei->i_fc_list,
1326				      &sbi->s_fc_q[FC_Q_STAGING]);
1327		}
1328		/*
1329		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
1330		 * visible before we send the wakeup. Pairs with implicit
1331		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
1332		 */
1333		smp_mb();
1334#if (BITS_PER_LONG < 64)
1335		wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
1336#else
1337		wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
1338#endif
1339	}
1340
1341	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1342		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1343					     struct ext4_fc_dentry_update,
1344					     fcd_list);
1345		list_del_init(&fc_dentry->fcd_list);
1346		list_del_init(&fc_dentry->fcd_dilist);
1347
1348		release_dentry_name_snapshot(&fc_dentry->fcd_name);
1349		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1350	}
1351
1352	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1353				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1354	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1355				&sbi->s_fc_q[FC_Q_MAIN]);
1356
1357	if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
1358		sbi->s_fc_ineligible_tid = 0;
1359		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1360	}
1361
1362	if (full)
1363		sbi->s_fc_bytes = 0;
1364	ext4_fc_unlock(sb, alloc_ctx);
1365	trace_ext4_fc_stats(sb);
1366}
1367
1368/* Ext4 Replay Path Routines */
1369
/*
 * Decoded form of a dentry TLV, filled in by tl_to_darg() and consumed by
 * the dentry replay routines.
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of the name at @dname, in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;		/* not set by tl_to_darg() */
	char *dname;		/* points at the name inside the TLV value */
};
1375
1376/* Same as struct ext4_fc_tl, but uses native endianness fields */
1377struct ext4_fc_tl_mem {
1378	u16 fc_tag;
1379	u16 fc_len;
1380};
1381
1382static inline void tl_to_darg(struct dentry_info_args *darg,
1383			      struct ext4_fc_tl_mem *tl, u8 *val)
1384{
1385	struct ext4_fc_dentry_info fcd;
1386
1387	memcpy(&fcd, val, sizeof(fcd));
1388
1389	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1390	darg->ino = le32_to_cpu(fcd.fc_ino);
1391	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1392	darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
1393}
1394
1395static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
1396{
1397	struct ext4_fc_tl tl_disk;
1398
1399	memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
1400	tl->fc_len = le16_to_cpu(tl_disk.fc_len);
1401	tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
1402}
1403
1404/* Unlink replay function */
1405static int ext4_fc_replay_unlink(struct super_block *sb,
1406				 struct ext4_fc_tl_mem *tl, u8 *val)
1407{
1408	struct inode *inode, *old_parent;
1409	struct qstr entry;
1410	struct dentry_info_args darg;
1411	int ret = 0;
1412
1413	tl_to_darg(&darg, tl, val);
1414
1415	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1416			darg.parent_ino, darg.dname_len);
1417
1418	entry.name = darg.dname;
1419	entry.len = darg.dname_len;
1420	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1421
1422	if (IS_ERR(inode)) {
1423		ext4_debug("Inode %d not found", darg.ino);
1424		return 0;
1425	}
1426
1427	old_parent = ext4_iget(sb, darg.parent_ino,
1428				EXT4_IGET_NORMAL);
1429	if (IS_ERR(old_parent)) {
1430		ext4_debug("Dir with inode %d not found", darg.parent_ino);
1431		iput(inode);
1432		return 0;
1433	}
1434
1435	ret = __ext4_unlink(old_parent, &entry, inode, NULL);
1436	/* -ENOENT ok coz it might not exist anymore. */
1437	if (ret == -ENOENT)
1438		ret = 0;
1439	iput(old_parent);
1440	iput(inode);
1441	return ret;
1442}
1443
1444static int ext4_fc_replay_link_internal(struct super_block *sb,
1445				struct dentry_info_args *darg,
1446				struct inode *inode)
1447{
1448	struct inode *dir = NULL;
1449	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1450	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1451	int ret = 0;
1452
1453	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1454	if (IS_ERR(dir)) {
1455		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1456		dir = NULL;
1457		goto out;
1458	}
1459
1460	dentry_dir = d_obtain_alias(dir);
1461	if (IS_ERR(dentry_dir)) {
1462		ext4_debug("Failed to obtain dentry");
1463		dentry_dir = NULL;
1464		goto out;
1465	}
1466
1467	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1468	if (!dentry_inode) {
1469		ext4_debug("Inode dentry not created.");
1470		ret = -ENOMEM;
1471		goto out;
1472	}
1473
1474	ret = __ext4_link(dir, inode, dentry_inode);
1475	/*
1476	 * It's possible that link already existed since data blocks
1477	 * for the dir in question got persisted before we crashed OR
1478	 * we replayed this tag and crashed before the entire replay
1479	 * could complete.
1480	 */
1481	if (ret && ret != -EEXIST) {
1482		ext4_debug("Failed to link\n");
1483		goto out;
1484	}
1485
1486	ret = 0;
1487out:
1488	if (dentry_dir) {
1489		d_drop(dentry_dir);
1490		dput(dentry_dir);
1491	} else if (dir) {
1492		iput(dir);
1493	}
1494	if (dentry_inode) {
1495		d_drop(dentry_inode);
1496		dput(dentry_inode);
1497	}
1498
1499	return ret;
1500}
1501
1502/* Link replay function */
1503static int ext4_fc_replay_link(struct super_block *sb,
1504			       struct ext4_fc_tl_mem *tl, u8 *val)
1505{
1506	struct inode *inode;
1507	struct dentry_info_args darg;
1508	int ret = 0;
1509
1510	tl_to_darg(&darg, tl, val);
1511	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1512			darg.parent_ino, darg.dname_len);
1513
1514	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1515	if (IS_ERR(inode)) {
1516		ext4_debug("Inode not found.");
1517		return 0;
1518	}
1519
1520	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1521	iput(inode);
1522	return ret;
1523}
1524
1525/*
1526 * Record all the modified inodes during replay. We use this later to setup
1527 * block bitmaps correctly.
1528 */
1529static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1530{
1531	struct ext4_fc_replay_state *state;
1532	int i;
1533
1534	state = &EXT4_SB(sb)->s_fc_replay_state;
1535	for (i = 0; i < state->fc_modified_inodes_used; i++)
1536		if (state->fc_modified_inodes[i] == ino)
1537			return 0;
1538	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1539		int *fc_modified_inodes;
1540
1541		fc_modified_inodes = krealloc(state->fc_modified_inodes,
1542				sizeof(int) * (state->fc_modified_inodes_size +
1543				EXT4_FC_REPLAY_REALLOC_INCREMENT),
1544				GFP_KERNEL);
1545		if (!fc_modified_inodes)
1546			return -ENOMEM;
1547		state->fc_modified_inodes = fc_modified_inodes;
1548		state->fc_modified_inodes_size +=
1549			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1550	}
1551	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1552	return 0;
1553}
1554
1555/*
1556 * Inode replay function
1557 */
1558static int ext4_fc_replay_inode(struct super_block *sb,
1559				struct ext4_fc_tl_mem *tl, u8 *val)
1560{
1561	struct ext4_fc_inode fc_inode;
1562	struct ext4_inode *raw_inode;
1563	struct ext4_inode *raw_fc_inode;
1564	struct inode *inode = NULL;
1565	struct ext4_iloc iloc;
1566	int inode_len, ino, ret, tag = tl->fc_tag;
1567	struct ext4_extent_header *eh;
1568	size_t off_gen = offsetof(struct ext4_inode, i_generation);
1569
1570	memcpy(&fc_inode, val, sizeof(fc_inode));
1571
1572	ino = le32_to_cpu(fc_inode.fc_ino);
1573	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1574
1575	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1576	if (!IS_ERR(inode)) {
1577		ext4_ext_clear_bb(inode);
1578		iput(inode);
1579	}
1580	inode = NULL;
1581
1582	ret = ext4_fc_record_modified_inode(sb, ino);
1583	if (ret)
1584		goto out;
1585
1586	raw_fc_inode = (struct ext4_inode *)
1587		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1588	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1589	if (ret)
1590		goto out;
1591
1592	inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
1593	raw_inode = ext4_raw_inode(&iloc);
1594
1595	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1596	memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
1597	       inode_len - off_gen);
1598	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1599		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1600		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1601			memset(eh, 0, sizeof(*eh));
1602			eh->eh_magic = EXT4_EXT_MAGIC;
1603			eh->eh_max = cpu_to_le16(
1604				(sizeof(raw_inode->i_block) -
1605				 sizeof(struct ext4_extent_header))
1606				 / sizeof(struct ext4_extent));
1607		}
1608	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1609		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1610			sizeof(raw_inode->i_block));
1611	}
1612
1613	/* Immediately update the inode on disk. */
1614	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1615	if (ret)
1616		goto out;
1617	ret = sync_dirty_buffer(iloc.bh);
1618	if (ret)
1619		goto out;
1620	ret = ext4_mark_inode_used(sb, ino);
1621	if (ret)
1622		goto out;
1623
1624	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1625	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1626	if (IS_ERR(inode)) {
1627		ext4_debug("Inode not found.");
1628		return -EFSCORRUPTED;
1629	}
1630
1631	/*
1632	 * Our allocator could have made different decisions than before
1633	 * crashing. This should be fixed but until then, we calculate
1634	 * the number of blocks the inode.
1635	 */
1636	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1637		ext4_ext_replay_set_iblocks(inode);
1638
1639	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1640	ext4_reset_inode_seed(inode);
1641
1642	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1643	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1644	sync_dirty_buffer(iloc.bh);
1645	brelse(iloc.bh);
1646out:
1647	iput(inode);
1648	if (!ret)
1649		blkdev_issue_flush(sb->s_bdev);
1650
1651	return 0;
1652}
1653
1654/*
1655 * Dentry create replay function.
1656 *
1657 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1658 * inode for which we are trying to create a dentry here, should already have
1659 * been replayed before we start here.
1660 */
1661static int ext4_fc_replay_create(struct super_block *sb,
1662				 struct ext4_fc_tl_mem *tl, u8 *val)
1663{
1664	int ret = 0;
1665	struct inode *inode = NULL;
1666	struct inode *dir = NULL;
1667	struct dentry_info_args darg;
1668
1669	tl_to_darg(&darg, tl, val);
1670
1671	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1672			darg.parent_ino, darg.dname_len);
1673
1674	/* This takes care of update group descriptor and other metadata */
1675	ret = ext4_mark_inode_used(sb, darg.ino);
1676	if (ret)
1677		goto out;
1678
1679	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1680	if (IS_ERR(inode)) {
1681		ext4_debug("inode %d not found.", darg.ino);
1682		inode = NULL;
1683		ret = -EINVAL;
1684		goto out;
1685	}
1686
1687	if (S_ISDIR(inode->i_mode)) {
1688		/*
1689		 * If we are creating a directory, we need to make sure that the
1690		 * dot and dot dot dirents are setup properly.
1691		 */
1692		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1693		if (IS_ERR(dir)) {
1694			ext4_debug("Dir %d not found.", darg.ino);
1695			goto out;
1696		}
1697		ret = ext4_init_new_dir(NULL, dir, inode);
1698		iput(dir);
1699		if (ret) {
1700			ret = 0;
1701			goto out;
1702		}
1703	}
1704	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1705	if (ret)
1706		goto out;
1707	set_nlink(inode, 1);
1708	ext4_mark_inode_dirty(NULL, inode);
1709out:
1710	iput(inode);
1711	return ret;
1712}
1713
1714/*
1715 * Record physical disk regions which are in use as per fast commit area,
1716 * and used by inodes during replay phase. Our simple replay phase
1717 * allocator excludes these regions from allocation.
1718 */
1719int ext4_fc_record_regions(struct super_block *sb, int ino,
1720		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1721{
1722	struct ext4_fc_replay_state *state;
1723	struct ext4_fc_alloc_region *region;
1724
1725	state = &EXT4_SB(sb)->s_fc_replay_state;
1726	/*
1727	 * during replay phase, the fc_regions_valid may not same as
1728	 * fc_regions_used, update it when do new additions.
1729	 */
1730	if (replay && state->fc_regions_used != state->fc_regions_valid)
1731		state->fc_regions_used = state->fc_regions_valid;
1732	if (state->fc_regions_used == state->fc_regions_size) {
1733		struct ext4_fc_alloc_region *fc_regions;
1734
1735		fc_regions = krealloc(state->fc_regions,
1736				      sizeof(struct ext4_fc_alloc_region) *
1737				      (state->fc_regions_size +
1738				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
1739				      GFP_KERNEL);
1740		if (!fc_regions)
1741			return -ENOMEM;
1742		state->fc_regions_size +=
1743			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1744		state->fc_regions = fc_regions;
1745	}
1746	region = &state->fc_regions[state->fc_regions_used++];
1747	region->ino = ino;
1748	region->lblk = lblk;
1749	region->pblk = pblk;
1750	region->len = len;
1751
1752	if (replay)
1753		state->fc_regions_valid++;
1754
1755	return 0;
1756}
1757
1758/* Replay add range tag */
1759static int ext4_fc_replay_add_range(struct super_block *sb,
1760				    struct ext4_fc_tl_mem *tl, u8 *val)
1761{
1762	struct ext4_fc_add_range fc_add_ex;
1763	struct ext4_extent newex, *ex;
1764	struct inode *inode;
1765	ext4_lblk_t start, cur;
1766	int remaining, len;
1767	ext4_fsblk_t start_pblk;
1768	struct ext4_map_blocks map;
1769	struct ext4_ext_path *path = NULL;
1770	int ret;
1771
1772	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1773	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1774
1775	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1776		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1777		ext4_ext_get_actual_len(ex));
1778
1779	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1780	if (IS_ERR(inode)) {
1781		ext4_debug("Inode not found.");
1782		return 0;
1783	}
1784
1785	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1786	if (ret)
1787		goto out;
1788
1789	start = le32_to_cpu(ex->ee_block);
1790	start_pblk = ext4_ext_pblock(ex);
1791	len = ext4_ext_get_actual_len(ex);
1792
1793	cur = start;
1794	remaining = len;
1795	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1796		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1797		  inode->i_ino);
1798
1799	while (remaining > 0) {
1800		map.m_lblk = cur;
1801		map.m_len = remaining;
1802		map.m_pblk = 0;
1803		ret = ext4_map_blocks(NULL, inode, &map, 0);
1804
1805		if (ret < 0)
1806			goto out;
1807
1808		if (ret == 0) {
1809			/* Range is not mapped */
1810			path = ext4_find_extent(inode, cur, path, 0);
1811			if (IS_ERR(path))
1812				goto out;
1813			memset(&newex, 0, sizeof(newex));
1814			newex.ee_block = cpu_to_le32(cur);
1815			ext4_ext_store_pblock(
1816				&newex, start_pblk + cur - start);
1817			newex.ee_len = cpu_to_le16(map.m_len);
1818			if (ext4_ext_is_unwritten(ex))
1819				ext4_ext_mark_unwritten(&newex);
1820			down_write(&EXT4_I(inode)->i_data_sem);
1821			path = ext4_ext_insert_extent(NULL, inode,
1822						      path, &newex, 0);
1823			up_write((&EXT4_I(inode)->i_data_sem));
1824			if (IS_ERR(path))
1825				goto out;
1826			goto next;
1827		}
1828
1829		if (start_pblk + cur - start != map.m_pblk) {
1830			/*
1831			 * Logical to physical mapping changed. This can happen
1832			 * if this range was removed and then reallocated to
1833			 * map to new physical blocks during a fast commit.
1834			 */
1835			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1836					ext4_ext_is_unwritten(ex),
1837					start_pblk + cur - start);
1838			if (ret)
1839				goto out;
1840			/*
1841			 * Mark the old blocks as free since they aren't used
1842			 * anymore. We maintain an array of all the modified
1843			 * inodes. In case these blocks are still used at either
1844			 * a different logical range in the same inode or in
1845			 * some different inode, we will mark them as allocated
1846			 * at the end of the FC replay using our array of
1847			 * modified inodes.
1848			 */
1849			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1850			goto next;
1851		}
1852
1853		/* Range is mapped and needs a state change */
1854		ext4_debug("Converting from %ld to %d %lld",
1855				map.m_flags & EXT4_MAP_UNWRITTEN,
1856			ext4_ext_is_unwritten(ex), map.m_pblk);
1857		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1858					ext4_ext_is_unwritten(ex), map.m_pblk);
1859		if (ret)
1860			goto out;
1861		/*
1862		 * We may have split the extent tree while toggling the state.
1863		 * Try to shrink the extent tree now.
1864		 */
1865		ext4_ext_replay_shrink_inode(inode, start + len);
1866next:
1867		cur += map.m_len;
1868		remaining -= map.m_len;
1869	}
1870	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1871					sb->s_blocksize_bits);
1872out:
1873	ext4_free_ext_path(path);
1874	iput(inode);
1875	return 0;
1876}
1877
1878/* Replay DEL_RANGE tag */
1879static int
1880ext4_fc_replay_del_range(struct super_block *sb,
1881			 struct ext4_fc_tl_mem *tl, u8 *val)
1882{
1883	struct inode *inode;
1884	struct ext4_fc_del_range lrange;
1885	struct ext4_map_blocks map;
1886	ext4_lblk_t cur, remaining;
1887	int ret;
1888
1889	memcpy(&lrange, val, sizeof(lrange));
1890	cur = le32_to_cpu(lrange.fc_lblk);
1891	remaining = le32_to_cpu(lrange.fc_len);
1892
1893	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1894		le32_to_cpu(lrange.fc_ino), cur, remaining);
1895
1896	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1897	if (IS_ERR(inode)) {
1898		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1899		return 0;
1900	}
1901
1902	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1903	if (ret)
1904		goto out;
1905
1906	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1907			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1908			le32_to_cpu(lrange.fc_len));
1909	while (remaining > 0) {
1910		map.m_lblk = cur;
1911		map.m_len = remaining;
1912
1913		ret = ext4_map_blocks(NULL, inode, &map, 0);
1914		if (ret < 0)
1915			goto out;
1916		if (ret > 0) {
1917			remaining -= ret;
1918			cur += ret;
1919			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1920		} else {
1921			remaining -= map.m_len;
1922			cur += map.m_len;
1923		}
1924	}
1925
1926	down_write(&EXT4_I(inode)->i_data_sem);
1927	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1928				le32_to_cpu(lrange.fc_lblk) +
1929				le32_to_cpu(lrange.fc_len) - 1);
1930	up_write(&EXT4_I(inode)->i_data_sem);
1931	if (ret)
1932		goto out;
1933	ext4_ext_replay_shrink_inode(inode,
1934		i_size_read(inode) >> sb->s_blocksize_bits);
1935	ext4_mark_inode_dirty(NULL, inode);
1936out:
1937	iput(inode);
1938	return 0;
1939}
1940
1941static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1942{
1943	struct ext4_fc_replay_state *state;
1944	struct inode *inode;
1945	struct ext4_ext_path *path = NULL;
1946	struct ext4_map_blocks map;
1947	int i, ret, j;
1948	ext4_lblk_t cur, end;
1949
1950	state = &EXT4_SB(sb)->s_fc_replay_state;
1951	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1952		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1953			EXT4_IGET_NORMAL);
1954		if (IS_ERR(inode)) {
1955			ext4_debug("Inode %d not found.",
1956				state->fc_modified_inodes[i]);
1957			continue;
1958		}
1959		cur = 0;
1960		end = EXT_MAX_BLOCKS;
1961		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1962			iput(inode);
1963			continue;
1964		}
1965		while (cur < end) {
1966			map.m_lblk = cur;
1967			map.m_len = end - cur;
1968
1969			ret = ext4_map_blocks(NULL, inode, &map, 0);
1970			if (ret < 0)
1971				break;
1972
1973			if (ret > 0) {
1974				path = ext4_find_extent(inode, map.m_lblk, path, 0);
1975				if (!IS_ERR(path)) {
1976					for (j = 0; j < path->p_depth; j++)
1977						ext4_mb_mark_bb(inode->i_sb,
1978							path[j].p_block, 1, true);
1979				} else {
1980					path = NULL;
1981				}
1982				cur += ret;
1983				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1984							map.m_len, true);
1985			} else {
1986				cur = cur + (map.m_len ? map.m_len : 1);
1987			}
1988		}
1989		iput(inode);
1990	}
1991
1992	ext4_free_ext_path(path);
1993}
1994
1995/*
1996 * Check if block is in excluded regions for block allocation. The simple
1997 * allocator that runs during replay phase is calls this function to see
1998 * if it is okay to use a block.
1999 */
2000bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
2001{
2002	int i;
2003	struct ext4_fc_replay_state *state;
2004
2005	state = &EXT4_SB(sb)->s_fc_replay_state;
2006	for (i = 0; i < state->fc_regions_valid; i++) {
2007		if (state->fc_regions[i].ino == 0 ||
2008			state->fc_regions[i].len == 0)
2009			continue;
2010		if (in_range(blk, state->fc_regions[i].pblk,
2011					state->fc_regions[i].len))
2012			return true;
2013	}
2014	return false;
2015}
2016
2017/* Cleanup function called after replay */
2018void ext4_fc_replay_cleanup(struct super_block *sb)
2019{
2020	struct ext4_sb_info *sbi = EXT4_SB(sb);
2021
2022	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
2023	kfree(sbi->s_fc_replay_state.fc_regions);
2024	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
2025}
2026
2027static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
2028				      int tag, int len)
2029{
2030	switch (tag) {
2031	case EXT4_FC_TAG_ADD_RANGE:
2032		return len == sizeof(struct ext4_fc_add_range);
2033	case EXT4_FC_TAG_DEL_RANGE:
2034		return len == sizeof(struct ext4_fc_del_range);
2035	case EXT4_FC_TAG_CREAT:
2036	case EXT4_FC_TAG_LINK:
2037	case EXT4_FC_TAG_UNLINK:
2038		len -= sizeof(struct ext4_fc_dentry_info);
2039		return len >= 1 && len <= EXT4_NAME_LEN;
2040	case EXT4_FC_TAG_INODE:
2041		len -= sizeof(struct ext4_fc_inode);
2042		return len >= EXT4_GOOD_OLD_INODE_SIZE &&
2043			len <= sbi->s_inode_size;
2044	case EXT4_FC_TAG_PAD:
2045		return true; /* padding can have any length */
2046	case EXT4_FC_TAG_TAIL:
2047		return len >= sizeof(struct ext4_fc_tail);
2048	case EXT4_FC_TAG_HEAD:
2049		return len == sizeof(struct ext4_fc_head);
2050	}
2051	return false;
2052}
2053
2054/*
2055 * Recovery Scan phase handler
2056 *
2057 * This function is called during the scan phase and is responsible
2058 * for doing following things:
2059 * - Make sure the fast commit area has valid tags for replay
2060 * - Count number of tags that need to be replayed by the replay handler
2061 * - Verify CRC
2062 * - Create a list of excluded blocks for allocation during replay phase
2063 *
2064 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
2065 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
2066 * to indicate that scan has finished and JBD2 can now start replay phase.
2067 * It returns a negative error to indicate that there was an error. At the end
2068 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
2069 * to indicate the number of tags that need to replayed during the replay phase.
2070 */
2071static int ext4_fc_replay_scan(journal_t *journal,
2072				struct buffer_head *bh, int off,
2073				tid_t expected_tid)
2074{
2075	struct super_block *sb = journal->j_private;
2076	struct ext4_sb_info *sbi = EXT4_SB(sb);
2077	struct ext4_fc_replay_state *state;
2078	int ret = JBD2_FC_REPLAY_CONTINUE;
2079	struct ext4_fc_add_range ext;
2080	struct ext4_fc_tl_mem tl;
2081	struct ext4_fc_tail tail;
2082	__u8 *start, *end, *cur, *val;
2083	struct ext4_fc_head head;
2084	struct ext4_extent *ex;
2085
2086	state = &sbi->s_fc_replay_state;
2087
2088	start = (u8 *)bh->b_data;
2089	end = start + journal->j_blocksize;
2090
2091	if (state->fc_replay_expected_off == 0) {
2092		state->fc_cur_tag = 0;
2093		state->fc_replay_num_tags = 0;
2094		state->fc_crc = 0;
2095		state->fc_regions = NULL;
2096		state->fc_regions_valid = state->fc_regions_used =
2097			state->fc_regions_size = 0;
2098		/* Check if we can stop early */
2099		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2100			!= EXT4_FC_TAG_HEAD)
2101			return 0;
2102	}
2103
2104	if (off != state->fc_replay_expected_off) {
2105		ret = -EFSCORRUPTED;
2106		goto out_err;
2107	}
2108
2109	state->fc_replay_expected_off++;
2110	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2111	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2112		ext4_fc_get_tl(&tl, cur);
2113		val = cur + EXT4_FC_TAG_BASE_LEN;
2114		if (tl.fc_len > end - val ||
2115		    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
2116			ret = state->fc_replay_num_tags ?
2117				JBD2_FC_REPLAY_STOP : -ECANCELED;
2118			goto out_err;
2119		}
2120		ext4_debug("Scan phase, tag:%s, blk %lld\n",
2121			   tag2str(tl.fc_tag), bh->b_blocknr);
2122		switch (tl.fc_tag) {
2123		case EXT4_FC_TAG_ADD_RANGE:
2124			memcpy(&ext, val, sizeof(ext));
2125			ex = (struct ext4_extent *)&ext.fc_ex;
2126			ret = ext4_fc_record_regions(sb,
2127				le32_to_cpu(ext.fc_ino),
2128				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2129				ext4_ext_get_actual_len(ex), 0);
2130			if (ret < 0)
2131				break;
2132			ret = JBD2_FC_REPLAY_CONTINUE;
2133			fallthrough;
2134		case EXT4_FC_TAG_DEL_RANGE:
2135		case EXT4_FC_TAG_LINK:
2136		case EXT4_FC_TAG_UNLINK:
2137		case EXT4_FC_TAG_CREAT:
2138		case EXT4_FC_TAG_INODE:
2139		case EXT4_FC_TAG_PAD:
2140			state->fc_cur_tag++;
2141			state->fc_crc = ext4_chksum(state->fc_crc, cur,
2142				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2143			break;
2144		case EXT4_FC_TAG_TAIL:
2145			state->fc_cur_tag++;
2146			memcpy(&tail, val, sizeof(tail));
2147			state->fc_crc = ext4_chksum(state->fc_crc, cur,
2148						EXT4_FC_TAG_BASE_LEN +
2149						offsetof(struct ext4_fc_tail,
2150						fc_crc));
2151			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2152				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2153				state->fc_replay_num_tags = state->fc_cur_tag;
2154				state->fc_regions_valid =
2155					state->fc_regions_used;
2156			} else {
2157				ret = state->fc_replay_num_tags ?
2158					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2159			}
2160			state->fc_crc = 0;
2161			break;
2162		case EXT4_FC_TAG_HEAD:
2163			memcpy(&head, val, sizeof(head));
2164			if (le32_to_cpu(head.fc_features) &
2165				~EXT4_FC_SUPPORTED_FEATURES) {
2166				ret = -EOPNOTSUPP;
2167				break;
2168			}
2169			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2170				ret = JBD2_FC_REPLAY_STOP;
2171				break;
2172			}
2173			state->fc_cur_tag++;
2174			state->fc_crc = ext4_chksum(state->fc_crc, cur,
2175				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2176			break;
2177		default:
2178			ret = state->fc_replay_num_tags ?
2179				JBD2_FC_REPLAY_STOP : -ECANCELED;
2180		}
2181		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2182			break;
2183	}
2184
2185out_err:
2186	trace_ext4_fc_replay_scan(sb, ret, off);
2187	return ret;
2188}
2189
2190/*
2191 * Main recovery path entry point.
2192 * The meaning of return codes is similar as above.
2193 */
2194static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2195				enum passtype pass, int off, tid_t expected_tid)
2196{
2197	struct super_block *sb = journal->j_private;
2198	struct ext4_sb_info *sbi = EXT4_SB(sb);
2199	struct ext4_fc_tl_mem tl;
2200	__u8 *start, *end, *cur, *val;
2201	int ret = JBD2_FC_REPLAY_CONTINUE;
2202	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2203	struct ext4_fc_tail tail;
2204
2205	if (pass == PASS_SCAN) {
2206		state->fc_current_pass = PASS_SCAN;
2207		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2208	}
2209
2210	if (state->fc_current_pass != pass) {
2211		state->fc_current_pass = pass;
2212		sbi->s_mount_state |= EXT4_FC_REPLAY;
2213	}
2214	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2215		ext4_debug("Replay stops\n");
2216		ext4_fc_set_bitmaps_and_counters(sb);
2217		return 0;
2218	}
2219
2220#ifdef CONFIG_EXT4_DEBUG
2221	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2222		pr_warn("Dropping fc block %d because max_replay set\n", off);
2223		return JBD2_FC_REPLAY_STOP;
2224	}
2225#endif
2226
2227	start = (u8 *)bh->b_data;
2228	end = start + journal->j_blocksize;
2229
2230	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2231	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2232		ext4_fc_get_tl(&tl, cur);
2233		val = cur + EXT4_FC_TAG_BASE_LEN;
2234
2235		if (state->fc_replay_num_tags == 0) {
2236			ret = JBD2_FC_REPLAY_STOP;
2237			ext4_fc_set_bitmaps_and_counters(sb);
2238			break;
2239		}
2240
2241		ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
2242		state->fc_replay_num_tags--;
2243		switch (tl.fc_tag) {
2244		case EXT4_FC_TAG_LINK:
2245			ret = ext4_fc_replay_link(sb, &tl, val);
2246			break;
2247		case EXT4_FC_TAG_UNLINK:
2248			ret = ext4_fc_replay_unlink(sb, &tl, val);
2249			break;
2250		case EXT4_FC_TAG_ADD_RANGE:
2251			ret = ext4_fc_replay_add_range(sb, &tl, val);
2252			break;
2253		case EXT4_FC_TAG_CREAT:
2254			ret = ext4_fc_replay_create(sb, &tl, val);
2255			break;
2256		case EXT4_FC_TAG_DEL_RANGE:
2257			ret = ext4_fc_replay_del_range(sb, &tl, val);
2258			break;
2259		case EXT4_FC_TAG_INODE:
2260			ret = ext4_fc_replay_inode(sb, &tl, val);
2261			break;
2262		case EXT4_FC_TAG_PAD:
2263			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2264					     tl.fc_len, 0);
2265			break;
2266		case EXT4_FC_TAG_TAIL:
2267			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
2268					     0, tl.fc_len, 0);
2269			memcpy(&tail, val, sizeof(tail));
2270			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2271			break;
2272		case EXT4_FC_TAG_HEAD:
2273			break;
2274		default:
2275			trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
2276			ret = -ECANCELED;
2277			break;
2278		}
2279		if (ret < 0)
2280			break;
2281		ret = JBD2_FC_REPLAY_CONTINUE;
2282	}
2283	return ret;
2284}
2285
2286void ext4_fc_init(struct super_block *sb, journal_t *journal)
2287{
2288	/*
2289	 * We set replay callback even if fast commit disabled because we may
2290	 * could still have fast commit blocks that need to be replayed even if
2291	 * fast commit has now been turned off.
2292	 */
2293	journal->j_fc_replay_callback = ext4_fc_replay;
2294	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2295		return;
2296	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2297}
2298
2299static const char * const fc_ineligible_reasons[] = {
2300	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
2301	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
2302	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
2303	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
2304	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
2305	[EXT4_FC_REASON_RESIZE] = "Resize",
2306	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
2307	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
2308	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
2309	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
2310	[EXT4_FC_REASON_MIGRATE] = "Inode format migration",
2311	[EXT4_FC_REASON_VERITY] = "fs-verity enable",
2312	[EXT4_FC_REASON_MOVE_EXT] = "Move extents",
2313};
2314
2315int ext4_fc_info_show(struct seq_file *seq, void *v)
2316{
2317	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2318	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2319	int i;
2320
2321	if (v != SEQ_START_TOKEN)
2322		return 0;
2323
2324	seq_printf(seq,
2325		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2326		   stats->fc_num_commits, stats->fc_ineligible_commits,
2327		   stats->fc_numblks,
2328		   div_u64(stats->s_fc_avg_commit_time, 1000));
2329	seq_puts(seq, "Ineligible reasons:\n");
2330	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2331		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2332			stats->fc_ineligible_reason_count[i]);
2333
2334	return 0;
2335}
2336
2337int __init ext4_fc_init_dentry_cache(void)
2338{
2339	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2340					   SLAB_RECLAIM_ACCOUNT);
2341
2342	if (ext4_fc_dentry_cachep == NULL)
2343		return -ENOMEM;
2344
2345	return 0;
2346}
2347
2348void ext4_fc_destroy_dentry_cache(void)
2349{
2350	kmem_cache_destroy(ext4_fc_dentry_cachep);
2351}
Configure Feed

Configure Feed