Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
10#include "ext4.h"
11#include "ext4_jbd2.h"
12#include "ext4_extents.h"
13#include "mballoc.h"
14
15#include <linux/lockdep.h>
16/*
17 * Ext4 Fast Commits
18 * -----------------
19 *
20 * Ext4 fast commits implement fine grained journalling for Ext4.
21 *
22 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
23 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
24 * TLV during the recovery phase. For the scenarios for which we currently
25 * don't have replay code, fast commit falls back to full commits.
26 * Fast commits record delta in one of the following three categories.
27 *
28 * (A) Directory entry updates:
29 *
30 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
31 * - EXT4_FC_TAG_LINK - records directory entry link
32 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
33 *
34 * (B) File specific data range updates:
35 *
36 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
37 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
38 *
39 * (C) Inode metadata (mtime / ctime etc):
40 *
41 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
42 * during recovery. Note that iblocks field is
43 * not replayed and instead derived during
44 * replay.
45 * Commit Operation
46 * ----------------
47 * With fast commits, we maintain all the directory entry operations in the
48 * order in which they are issued in an in-memory queue. This queue is flushed
49 * to disk during the commit operation. We also maintain a list of inodes
50 * that need to be committed during a fast commit in another in memory queue of
51 * inodes. During the commit operation, we commit in the following order:
52 *
53 * [1] Prepare all the inodes to write out their data by setting
54 * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
55 * deleted while it is being flushed.
56 * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
57 * state.
58 * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
 *     all the existing handles finish and no new handles can start.
60 * [4] Mark all the fast commit eligible inodes as undergoing fast commit
61 * by setting "EXT4_STATE_FC_COMMITTING" state.
62 * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
63 * starting of new handles. If new handles try to start an update on
64 * any of the inodes that are being committed, ext4_fc_track_inode()
65 * will block until those inodes have finished the fast commit.
66 * [6] Commit all the directory entry updates in the fast commit space.
67 * [7] Commit all the changed inodes in the fast commit space and clear
68 * "EXT4_STATE_FC_COMMITTING" for these inodes.
69 * [8] Write tail tag (this tag ensures the atomicity, please read the following
70 * section for more details).
71 *
 * All the inode updates must be enclosed within jbd2_journal_start()
73 * and jbd2_journal_stop() similar to JBD2 journaling.
74 *
75 * Fast Commit Ineligibility
76 * -------------------------
77 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(). This makes the next fast commit operation fall
 * back to a full commit.
82 *
83 * Atomicity of commits
84 * --------------------
85 * In order to guarantee atomicity during the commit operation, fast commit
86 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
87 * tag contains CRC of the contents and TID of the transaction after which
88 * this fast commit should be applied. Recovery code replays fast commit
89 * logs only if there's at least 1 valid tail present. For every fast commit
90 * operation, there is 1 tail. This means, we may end up with multiple tails
91 * in the fast commit space. Here's an example:
92 *
93 * - Create a new file A and remove existing file B
94 * - fsync()
95 * - Append contents to file A
96 * - Truncate file A
97 * - fsync()
98 *
99 * The fast commit space at the end of above operations would look like this:
100 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
101 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
102 *
103 * Replay code should thus check for all the valid tails in the FC area.
104 *
105 * Fast Commit Replay Idempotence
106 * ------------------------------
107 *
108 * Fast commits tags are idempotent in nature provided the recovery code follows
109 * certain rules. The guiding principle that the commit path follows while
110 * committing is that it stores the result of a particular operation instead of
111 * storing the procedure.
112 *
113 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
114 * was associated with inode 10. During fast commit, instead of storing this
115 * operation as a procedure "rename a to b", we store the resulting file system
116 * state as a "series" of outcomes:
117 *
118 * - Link dirent b to inode 10
119 * - Unlink dirent a
120 * - Inode <10> with valid refcount
121 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
123 * system. This is what guarantees idempotence of fast commit replay.
124 *
125 * Let's take an example of a procedure that is not idempotent and see how fast
126 * commits make it idempotent. Consider following sequence of operations:
127 *
128 * rm A; mv B A; read A
129 * (x) (y) (z)
130 *
131 * (x), (y) and (z) are the points at which we can crash. If we store this
132 * sequence of operations as is then the replay is not idempotent. Let's say
133 * while in replay, we crash at (z). During the second replay, file A (which was
134 * actually created as a result of "mv B A" operation) would get deleted. Thus,
135 * file named A would be absent when we try to read A. So, this sequence of
136 * operations is not idempotent. However, as mentioned above, instead of storing
137 * the procedure fast commits store the outcome of each procedure. Thus the fast
138 * commit log for above procedure would be as follows:
139 *
140 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
141 * inode 11 before the replay)
142 *
143 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
144 * (w) (x) (y) (z)
145 *
146 * If we crash at (z), we will have file A linked to inode 11. During the second
147 * replay, we will remove file A (inode 11). But we will create it back and make
148 * it point to inode 11. We won't find B, so we'll just skip that step. At this
149 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
150 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
151 * similarly. Thus, by converting a non-idempotent procedure into a series of
152 * idempotent outcomes, fast commits ensured idempotence during the replay.
153 *
154 * Locking
155 * -------
156 * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
157 * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
158 * inode. Most of the code avoids acquiring both the locks, but if one must do
159 * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
160 *
161 * TODOs
162 * -----
163 *
164 * 0) Fast commit replay path hardening: Fast commit replay code should use
165 * journal handles to make sure all the updates it does during the replay
166 * path are atomic. With that if we crash during fast commit replay, after
167 * trying to do recovery again, we will find a file system where fast commit
168 * area is invalid (because new full commit would be found). In order to deal
169 * with that, fast commit replay code should ensure that the "FC_REPLAY"
170 * superblock state is persisted before starting the replay, so that after
171 * the crash, fast commit recovery code can look at that flag and perform
172 * fast commit recovery even if that area is invalidated by later full
173 * commits.
174 *
175 * 1) Handle more ineligible cases.
176 *
177 * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent
178 * status tree. This would get rid of the need to call ext4_fc_track_inode()
179 * before acquiring i_data_sem. To do that we would need to ensure that
180 * modified extents from the extent status tree are not evicted from memory.
181 */
182
183#include <trace/events/ext4.h>
184static struct kmem_cache *ext4_fc_dentry_cachep;
185
186static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
187{
188 BUFFER_TRACE(bh, "");
189 if (uptodate) {
190 ext4_debug("%s: Block %lld up-to-date",
191 __func__, bh->b_blocknr);
192 set_buffer_uptodate(bh);
193 } else {
194 ext4_debug("%s: Block %lld not up-to-date",
195 __func__, bh->b_blocknr);
196 clear_buffer_uptodate(bh);
197 }
198
199 unlock_buffer(bh);
200}
201
202static inline void ext4_fc_reset_inode(struct inode *inode)
203{
204 struct ext4_inode_info *ei = EXT4_I(inode);
205
206 ei->i_fc_lblk_start = 0;
207 ei->i_fc_lblk_len = 0;
208}
209
210void ext4_fc_init_inode(struct inode *inode)
211{
212 struct ext4_inode_info *ei = EXT4_I(inode);
213
214 ext4_fc_reset_inode(inode);
215 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
216 INIT_LIST_HEAD(&ei->i_fc_list);
217 INIT_LIST_HEAD(&ei->i_fc_dilist);
218 init_waitqueue_head(&ei->i_fc_wait);
219}
220
221static bool ext4_fc_disabled(struct super_block *sb)
222{
223 return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
224 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
225}
226
227/*
228 * Remove inode from fast commit list. If the inode is being committed
229 * we wait until inode commit is done.
230 */
231void ext4_fc_del(struct inode *inode)
232{
233 struct ext4_inode_info *ei = EXT4_I(inode);
234 struct ext4_fc_dentry_update *fc_dentry;
235 wait_queue_head_t *wq;
236 int alloc_ctx;
237
238 if (ext4_fc_disabled(inode->i_sb))
239 return;
240
241 alloc_ctx = ext4_fc_lock(inode->i_sb);
242 if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
243 ext4_fc_unlock(inode->i_sb, alloc_ctx);
244 return;
245 }
246
247 /*
248 * Since ext4_fc_del is called from ext4_evict_inode while having a
249 * handle open, there is no need for us to wait here even if a fast
250 * commit is going on. That is because, if this inode is being
251 * committed, ext4_mark_inode_dirty would have waited for inode commit
252 * operation to finish before we come here. So, by the time we come
253 * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
254 * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
255 * here.
256 *
257 * We may come here without any handles open in the "no_delete" case of
258 * ext4_evict_inode as well. However, if that happens, we first mark the
259 * file system as fast commit ineligible anyway. So, even in that case,
260 * it is okay to remove the inode from the fc list.
261 */
262 WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
263 && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
264 while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
265#if (BITS_PER_LONG < 64)
266 DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
267 EXT4_STATE_FC_FLUSHING_DATA);
268 wq = bit_waitqueue(&ei->i_state_flags,
269 EXT4_STATE_FC_FLUSHING_DATA);
270#else
271 DEFINE_WAIT_BIT(wait, &ei->i_flags,
272 EXT4_STATE_FC_FLUSHING_DATA);
273 wq = bit_waitqueue(&ei->i_flags,
274 EXT4_STATE_FC_FLUSHING_DATA);
275#endif
276 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
277 if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
278 ext4_fc_unlock(inode->i_sb, alloc_ctx);
279 schedule();
280 alloc_ctx = ext4_fc_lock(inode->i_sb);
281 }
282 finish_wait(wq, &wait.wq_entry);
283 }
284 list_del_init(&ei->i_fc_list);
285
286 /*
287 * Since this inode is getting removed, let's also remove all FC
288 * dentry create references, since it is not needed to log it anyways.
289 */
290 if (list_empty(&ei->i_fc_dilist)) {
291 ext4_fc_unlock(inode->i_sb, alloc_ctx);
292 return;
293 }
294
295 fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
296 WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
297 list_del_init(&fc_dentry->fcd_list);
298 list_del_init(&fc_dentry->fcd_dilist);
299
300 WARN_ON(!list_empty(&ei->i_fc_dilist));
301 ext4_fc_unlock(inode->i_sb, alloc_ctx);
302
303 release_dentry_name_snapshot(&fc_dentry->fcd_name);
304 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
305}
306
307/*
308 * Mark file system as fast commit ineligible, and record latest
309 * ineligible transaction tid. This means until the recorded
310 * transaction, commit operation would result in a full jbd2 commit.
311 */
312void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
313{
314 struct ext4_sb_info *sbi = EXT4_SB(sb);
315 tid_t tid;
316 bool has_transaction = true;
317 bool is_ineligible;
318 int alloc_ctx;
319
320 if (ext4_fc_disabled(sb))
321 return;
322
323 if (handle && !IS_ERR(handle))
324 tid = handle->h_transaction->t_tid;
325 else {
326 read_lock(&sbi->s_journal->j_state_lock);
327 if (sbi->s_journal->j_running_transaction)
328 tid = sbi->s_journal->j_running_transaction->t_tid;
329 else
330 has_transaction = false;
331 read_unlock(&sbi->s_journal->j_state_lock);
332 }
333 alloc_ctx = ext4_fc_lock(sb);
334 is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
335 if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
336 sbi->s_fc_ineligible_tid = tid;
337 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
338 ext4_fc_unlock(sb, alloc_ctx);
339 WARN_ON(reason >= EXT4_FC_REASON_MAX);
340 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
341}
342
343/*
344 * Generic fast commit tracking function. If this is the first time this we are
345 * called after a full commit, we initialize fast commit fields and then call
346 * __fc_track_fn() with update = 0. If we have already been called after a full
347 * commit, we pass update = 1. Based on that, the track function can determine
348 * if it needs to track a field for the first time or if it needs to just
349 * update the previously tracked value.
350 *
351 * If enqueue is set, this function enqueues the inode in fast commit list.
352 */
353static int ext4_fc_track_template(
354 handle_t *handle, struct inode *inode,
355 int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
356 void *args, int enqueue)
357{
358 bool update = false;
359 struct ext4_inode_info *ei = EXT4_I(inode);
360 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
361 tid_t tid = 0;
362 int alloc_ctx;
363 int ret;
364
365 tid = handle->h_transaction->t_tid;
366 spin_lock(&ei->i_fc_lock);
367 if (tid == ei->i_sync_tid) {
368 update = true;
369 } else {
370 ext4_fc_reset_inode(inode);
371 ei->i_sync_tid = tid;
372 }
373 ret = __fc_track_fn(handle, inode, args, update);
374 spin_unlock(&ei->i_fc_lock);
375 if (!enqueue)
376 return ret;
377
378 alloc_ctx = ext4_fc_lock(inode->i_sb);
379 if (list_empty(&EXT4_I(inode)->i_fc_list))
380 list_add_tail(&EXT4_I(inode)->i_fc_list,
381 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
382 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
383 &sbi->s_fc_q[FC_Q_STAGING] :
384 &sbi->s_fc_q[FC_Q_MAIN]);
385 ext4_fc_unlock(inode->i_sb, alloc_ctx);
386
387 return ret;
388}
389
/* Argument bundle handed to __track_dentry_update() through the void *arg. */
struct __track_dentry_update_args {
	/* directory entry being linked, unlinked or created */
	struct dentry *dentry;
	/* tag to record: EXT4_FC_TAG_LINK, _UNLINK or _CREAT */
	int op;
};
394
395/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
396static int __track_dentry_update(handle_t *handle, struct inode *inode,
397 void *arg, bool update)
398{
399 struct ext4_fc_dentry_update *node;
400 struct ext4_inode_info *ei = EXT4_I(inode);
401 struct __track_dentry_update_args *dentry_update =
402 (struct __track_dentry_update_args *)arg;
403 struct dentry *dentry = dentry_update->dentry;
404 struct inode *dir = dentry->d_parent->d_inode;
405 struct super_block *sb = inode->i_sb;
406 struct ext4_sb_info *sbi = EXT4_SB(sb);
407 int alloc_ctx;
408
409 spin_unlock(&ei->i_fc_lock);
410
411 if (IS_ENCRYPTED(dir)) {
412 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
413 handle);
414 spin_lock(&ei->i_fc_lock);
415 return -EOPNOTSUPP;
416 }
417
418 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
419 if (!node) {
420 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
421 spin_lock(&ei->i_fc_lock);
422 return -ENOMEM;
423 }
424
425 node->fcd_op = dentry_update->op;
426 node->fcd_parent = dir->i_ino;
427 node->fcd_ino = inode->i_ino;
428 take_dentry_name_snapshot(&node->fcd_name, dentry);
429 INIT_LIST_HEAD(&node->fcd_dilist);
430 INIT_LIST_HEAD(&node->fcd_list);
431 alloc_ctx = ext4_fc_lock(sb);
432 if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
433 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
434 list_add_tail(&node->fcd_list,
435 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
436 else
437 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
438
439 /*
440 * This helps us keep a track of all fc_dentry updates which is part of
441 * this ext4 inode. So in case the inode is getting unlinked, before
442 * even we get a chance to fsync, we could remove all fc_dentry
443 * references while evicting the inode in ext4_fc_del().
444 * Also with this, we don't need to loop over all the inodes in
445 * sbi->s_fc_q to get the corresponding inode in
446 * ext4_fc_commit_dentry_updates().
447 */
448 if (dentry_update->op == EXT4_FC_TAG_CREAT) {
449 WARN_ON(!list_empty(&ei->i_fc_dilist));
450 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
451 }
452 ext4_fc_unlock(sb, alloc_ctx);
453 spin_lock(&ei->i_fc_lock);
454
455 return 0;
456}
457
458void __ext4_fc_track_unlink(handle_t *handle,
459 struct inode *inode, struct dentry *dentry)
460{
461 struct __track_dentry_update_args args;
462 int ret;
463
464 args.dentry = dentry;
465 args.op = EXT4_FC_TAG_UNLINK;
466
467 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
468 (void *)&args, 0);
469 trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
470}
471
472void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
473{
474 struct inode *inode = d_inode(dentry);
475
476 if (ext4_fc_disabled(inode->i_sb))
477 return;
478
479 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
480 return;
481
482 __ext4_fc_track_unlink(handle, inode, dentry);
483}
484
485void __ext4_fc_track_link(handle_t *handle,
486 struct inode *inode, struct dentry *dentry)
487{
488 struct __track_dentry_update_args args;
489 int ret;
490
491 args.dentry = dentry;
492 args.op = EXT4_FC_TAG_LINK;
493
494 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
495 (void *)&args, 0);
496 trace_ext4_fc_track_link(handle, inode, dentry, ret);
497}
498
499void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
500{
501 struct inode *inode = d_inode(dentry);
502
503 if (ext4_fc_disabled(inode->i_sb))
504 return;
505
506 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
507 return;
508
509 __ext4_fc_track_link(handle, inode, dentry);
510}
511
512void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
513 struct dentry *dentry)
514{
515 struct __track_dentry_update_args args;
516 int ret;
517
518 args.dentry = dentry;
519 args.op = EXT4_FC_TAG_CREAT;
520
521 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
522 (void *)&args, 0);
523 trace_ext4_fc_track_create(handle, inode, dentry, ret);
524}
525
526void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
527{
528 struct inode *inode = d_inode(dentry);
529
530 if (ext4_fc_disabled(inode->i_sb))
531 return;
532
533 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
534 return;
535
536 __ext4_fc_track_create(handle, inode, dentry);
537}
538
539/* __track_fn for inode tracking */
540static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
541 bool update)
542{
543 if (update)
544 return -EEXIST;
545
546 EXT4_I(inode)->i_fc_lblk_len = 0;
547
548 return 0;
549}
550
551void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
552{
553 struct ext4_inode_info *ei = EXT4_I(inode);
554 wait_queue_head_t *wq;
555 int ret;
556
557 if (S_ISDIR(inode->i_mode))
558 return;
559
560 if (ext4_fc_disabled(inode->i_sb))
561 return;
562
563 if (ext4_should_journal_data(inode)) {
564 ext4_fc_mark_ineligible(inode->i_sb,
565 EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
566 return;
567 }
568
569 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
570 return;
571
572 /*
573 * If we come here, we may sleep while waiting for the inode to
574 * commit. We shouldn't be holding i_data_sem when we go to sleep since
575 * the commit path needs to grab the lock while committing the inode.
576 */
577 lockdep_assert_not_held(&ei->i_data_sem);
578
579 while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
580#if (BITS_PER_LONG < 64)
581 DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
582 EXT4_STATE_FC_COMMITTING);
583 wq = bit_waitqueue(&ei->i_state_flags,
584 EXT4_STATE_FC_COMMITTING);
585#else
586 DEFINE_WAIT_BIT(wait, &ei->i_flags,
587 EXT4_STATE_FC_COMMITTING);
588 wq = bit_waitqueue(&ei->i_flags,
589 EXT4_STATE_FC_COMMITTING);
590#endif
591 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
592 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
593 schedule();
594 finish_wait(wq, &wait.wq_entry);
595 }
596
597 /*
598 * From this point on, this inode will not be committed either
599 * by fast or full commit as long as the handle is open.
600 */
601 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
602 trace_ext4_fc_track_inode(handle, inode, ret);
603}
604
605struct __track_range_args {
606 ext4_lblk_t start, end;
607};
608
609/* __track_fn for tracking data updates */
610static int __track_range(handle_t *handle, struct inode *inode, void *arg,
611 bool update)
612{
613 struct ext4_inode_info *ei = EXT4_I(inode);
614 ext4_lblk_t oldstart;
615 struct __track_range_args *__arg =
616 (struct __track_range_args *)arg;
617
618 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
619 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
620 return -ECANCELED;
621 }
622
623 oldstart = ei->i_fc_lblk_start;
624
625 if (update && ei->i_fc_lblk_len > 0) {
626 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
627 ei->i_fc_lblk_len =
628 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
629 ei->i_fc_lblk_start + 1;
630 } else {
631 ei->i_fc_lblk_start = __arg->start;
632 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
633 }
634
635 return 0;
636}
637
638void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
639 ext4_lblk_t end)
640{
641 struct __track_range_args args;
642 int ret;
643
644 if (S_ISDIR(inode->i_mode))
645 return;
646
647 if (ext4_fc_disabled(inode->i_sb))
648 return;
649
650 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
651 return;
652
653 if (ext4_has_inline_data(inode)) {
654 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
655 handle);
656 return;
657 }
658
659 args.start = start;
660 args.end = end;
661
662 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
663
664 trace_ext4_fc_track_range(handle, inode, start, end, ret);
665}
666
667static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
668{
669 blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS;
670 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
671
672 /* Add REQ_FUA | REQ_PREFLUSH only its tail */
673 if (test_opt(sb, BARRIER) && is_tail)
674 write_flags |= REQ_FUA | REQ_PREFLUSH;
675 lock_buffer(bh);
676 set_buffer_dirty(bh);
677 set_buffer_uptodate(bh);
678 bh->b_end_io = ext4_end_buffer_io_sync;
679 submit_bh(REQ_OP_WRITE | write_flags, bh);
680 EXT4_SB(sb)->s_fc_bh = NULL;
681}
682
683/* Ext4 commit path routines */
684
685/*
686 * Allocate len bytes on a fast commit buffer.
687 *
688 * During the commit time this function is used to manage fast commit
689 * block space. We don't split a fast commit log onto different
690 * blocks. So this function makes sure that if there's not enough space
691 * on the current block, the remaining space in the current block is
692 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
693 * new block is from jbd2 and CRC is updated to reflect the padding
694 * we added.
695 */
696static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
697{
698 struct ext4_fc_tl tl;
699 struct ext4_sb_info *sbi = EXT4_SB(sb);
700 struct buffer_head *bh;
701 int bsize = sbi->s_journal->j_blocksize;
702 int ret, off = sbi->s_fc_bytes % bsize;
703 int remaining;
704 u8 *dst;
705
706 /*
707 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
708 * cannot fulfill the request.
709 */
710 if (len > bsize - EXT4_FC_TAG_BASE_LEN)
711 return NULL;
712
713 if (!sbi->s_fc_bh) {
714 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
715 if (ret)
716 return NULL;
717 sbi->s_fc_bh = bh;
718 }
719 dst = sbi->s_fc_bh->b_data + off;
720
721 /*
722 * Allocate the bytes in the current block if we can do so while still
723 * leaving enough space for a PAD tlv.
724 */
725 remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
726 if (len <= remaining) {
727 sbi->s_fc_bytes += len;
728 return dst;
729 }
730
731 /*
732 * Else, terminate the current block with a PAD tlv, then allocate a new
733 * block and allocate the bytes at the start of that new block.
734 */
735
736 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
737 tl.fc_len = cpu_to_le16(remaining);
738 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
739 memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
740 *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize);
741
742 ext4_fc_submit_bh(sb, false);
743
744 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
745 if (ret)
746 return NULL;
747 sbi->s_fc_bh = bh;
748 sbi->s_fc_bytes += bsize - off + len;
749 return sbi->s_fc_bh->b_data;
750}
751
752/*
753 * Complete a fast commit by writing tail tag.
754 *
755 * Writing tail tag marks the end of a fast commit. In order to guarantee
756 * atomicity, after writing tail tag, even if there's space remaining
757 * in the block, next commit shouldn't use it. That's why tail tag
758 * has the length as that of the remaining space on the block.
759 */
760static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
761{
762 struct ext4_sb_info *sbi = EXT4_SB(sb);
763 struct ext4_fc_tl tl;
764 struct ext4_fc_tail tail;
765 int off, bsize = sbi->s_journal->j_blocksize;
766 u8 *dst;
767
768 /*
769 * ext4_fc_reserve_space takes care of allocating an extra block if
770 * there's no enough space on this block for accommodating this tail.
771 */
772 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
773 if (!dst)
774 return -ENOSPC;
775
776 off = sbi->s_fc_bytes % bsize;
777
778 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
779 tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
780 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
781
782 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
783 dst += EXT4_FC_TAG_BASE_LEN;
784 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
785 memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
786 dst += sizeof(tail.fc_tid);
787 crc = ext4_chksum(crc, sbi->s_fc_bh->b_data,
788 dst - (u8 *)sbi->s_fc_bh->b_data);
789 tail.fc_crc = cpu_to_le32(crc);
790 memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
791 dst += sizeof(tail.fc_crc);
792 memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
793
794 ext4_fc_submit_bh(sb, true);
795
796 return 0;
797}
798
799/*
800 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
801 * Returns false if there's not enough space.
802 */
803static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
804 u32 *crc)
805{
806 struct ext4_fc_tl tl;
807 u8 *dst;
808
809 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
810 if (!dst)
811 return false;
812
813 tl.fc_tag = cpu_to_le16(tag);
814 tl.fc_len = cpu_to_le16(len);
815
816 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
817 memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
818
819 return true;
820}
821
822/* Same as above, but adds dentry tlv. */
823static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
824 struct ext4_fc_dentry_update *fc_dentry)
825{
826 struct ext4_fc_dentry_info fcd;
827 struct ext4_fc_tl tl;
828 int dlen = fc_dentry->fcd_name.name.len;
829 u8 *dst = ext4_fc_reserve_space(sb,
830 EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
831
832 if (!dst)
833 return false;
834
835 fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
836 fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
837 tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
838 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
839 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
840 dst += EXT4_FC_TAG_BASE_LEN;
841 memcpy(dst, &fcd, sizeof(fcd));
842 dst += sizeof(fcd);
843 memcpy(dst, fc_dentry->fcd_name.name.name, dlen);
844
845 return true;
846}
847
848/*
849 * Writes inode in the fast commit space under TLV with tag @tag.
850 * Returns 0 on success, error on failure.
851 */
852static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
853{
854 struct ext4_inode_info *ei = EXT4_I(inode);
855 int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
856 int ret;
857 struct ext4_iloc iloc;
858 struct ext4_fc_inode fc_inode;
859 struct ext4_fc_tl tl;
860 u8 *dst;
861
862 ret = ext4_get_inode_loc(inode, &iloc);
863 if (ret)
864 return ret;
865
866 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
867 inode_len = EXT4_INODE_SIZE(inode->i_sb);
868 else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
869 inode_len += ei->i_extra_isize;
870
871 fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
872 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
873 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
874
875 ret = -ECANCELED;
876 dst = ext4_fc_reserve_space(inode->i_sb,
877 EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
878 if (!dst)
879 goto err;
880
881 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
882 dst += EXT4_FC_TAG_BASE_LEN;
883 memcpy(dst, &fc_inode, sizeof(fc_inode));
884 dst += sizeof(fc_inode);
885 memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
886 ret = 0;
887err:
888 brelse(iloc.bh);
889 return ret;
890}
891
892/*
893 * Writes updated data ranges for the inode in question. Updates CRC.
894 * Returns 0 on success, error otherwise.
895 */
896static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
897{
898 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
899 struct ext4_inode_info *ei = EXT4_I(inode);
900 struct ext4_map_blocks map;
901 struct ext4_fc_add_range fc_ext;
902 struct ext4_fc_del_range lrange;
903 struct ext4_extent *ex;
904 int ret;
905
906 spin_lock(&ei->i_fc_lock);
907 if (ei->i_fc_lblk_len == 0) {
908 spin_unlock(&ei->i_fc_lock);
909 return 0;
910 }
911 old_blk_size = ei->i_fc_lblk_start;
912 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
913 ei->i_fc_lblk_len = 0;
914 spin_unlock(&ei->i_fc_lock);
915
916 cur_lblk_off = old_blk_size;
917 ext4_debug("will try writing %d to %d for inode %ld\n",
918 cur_lblk_off, new_blk_size, inode->i_ino);
919
920 while (cur_lblk_off <= new_blk_size) {
921 map.m_lblk = cur_lblk_off;
922 map.m_len = new_blk_size - cur_lblk_off + 1;
923 ret = ext4_map_blocks(NULL, inode, &map,
924 EXT4_GET_BLOCKS_IO_SUBMIT |
925 EXT4_EX_NOCACHE);
926 if (ret < 0)
927 return -ECANCELED;
928
929 if (map.m_len == 0) {
930 cur_lblk_off++;
931 continue;
932 }
933
934 if (ret == 0) {
935 lrange.fc_ino = cpu_to_le32(inode->i_ino);
936 lrange.fc_lblk = cpu_to_le32(map.m_lblk);
937 lrange.fc_len = cpu_to_le32(map.m_len);
938 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
939 sizeof(lrange), (u8 *)&lrange, crc))
940 return -ENOSPC;
941 } else {
942 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
943 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
944
945 /* Limit the number of blocks in one extent */
946 map.m_len = min(max, map.m_len);
947
948 fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
949 ex = (struct ext4_extent *)&fc_ext.fc_ex;
950 ex->ee_block = cpu_to_le32(map.m_lblk);
951 ex->ee_len = cpu_to_le16(map.m_len);
952 ext4_ext_store_pblock(ex, map.m_pblk);
953 if (map.m_flags & EXT4_MAP_UNWRITTEN)
954 ext4_ext_mark_unwritten(ex);
955 else
956 ext4_ext_mark_initialized(ex);
957 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
958 sizeof(fc_ext), (u8 *)&fc_ext, crc))
959 return -ENOSPC;
960 }
961
962 cur_lblk_off += map.m_len;
963 }
964
965 return 0;
966}
967
968
969/* Flushes data of all the inodes in the commit queue. */
970static int ext4_fc_flush_data(journal_t *journal)
971{
972 struct super_block *sb = journal->j_private;
973 struct ext4_sb_info *sbi = EXT4_SB(sb);
974 struct ext4_inode_info *ei;
975 int ret = 0;
976
977 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
978 ret = jbd2_submit_inode_data(journal, ei->jinode);
979 if (ret)
980 return ret;
981 }
982
983 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
984 ret = jbd2_wait_inode_data(journal, ei->jinode);
985 if (ret)
986 return ret;
987 }
988
989 return 0;
990}
991
992/* Commit all the directory entry updates */
993static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
994{
995 struct super_block *sb = journal->j_private;
996 struct ext4_sb_info *sbi = EXT4_SB(sb);
997 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
998 struct inode *inode;
999 struct ext4_inode_info *ei;
1000 int ret;
1001
1002 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1003 return 0;
1004 list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1005 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1006 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1007 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
1008 return -ENOSPC;
1009 continue;
1010 }
1011 /*
1012 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1013 * corresponding inode. Also, the corresponding inode could have been
1014 * deleted, in which case, we don't need to do anything.
1015 */
1016 if (list_empty(&fc_dentry->fcd_dilist))
1017 continue;
1018 ei = list_first_entry(&fc_dentry->fcd_dilist,
1019 struct ext4_inode_info, i_fc_dilist);
1020 inode = &ei->vfs_inode;
1021 WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1022
1023 /*
1024 * We first write the inode and then the create dirent. This
1025 * allows the recovery code to create an unnamed inode first
1026 * and then link it to a directory entry. This allows us
1027 * to use namei.c routines almost as is and simplifies
1028 * the recovery code.
1029 */
1030 ret = ext4_fc_write_inode(inode, crc);
1031 if (ret)
1032 return ret;
1033 ret = ext4_fc_write_inode_data(inode, crc);
1034 if (ret)
1035 return ret;
1036 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
1037 return -ENOSPC;
1038 }
1039 return 0;
1040}
1041
/*
 * Perform one fast commit for @journal.
 *
 * Flushes the data of every inode on the main commit queue, marks those
 * inodes as committing while journal updates are locked, and then writes the
 * fast commit area: an optional head tag, the dentry updates, the per-inode
 * data range and inode tags, and finally the tail tag.
 *
 * Returns 0 on success, the error from the data flush, -ENOSPC if the head
 * tag cannot be added, or the error returned by one of the write helpers.
 */
static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;
	int alloc_ctx;

	/*
	 * Step 1: Mark all inodes on s_fc_q[MAIN] with
	 * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
	 * freed until the data flush is over.
	 */
	alloc_ctx = ext4_fc_lock(sb);
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&iter->vfs_inode,
				     EXT4_STATE_FC_FLUSHING_DATA);
	}
	ext4_fc_unlock(sb, alloc_ctx);

	/* Step 2: Flush data for all the eligible inodes. */
	ret = ext4_fc_flush_data(journal);

	/*
	 * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning
	 * any error from step 2. This ensures that waiters waiting on
	 * EXT4_STATE_FC_FLUSHING_DATA can resume.
	 */
	alloc_ctx = ext4_fc_lock(sb);
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_FLUSHING_DATA);
		/* The word holding the state bits differs by word size. */
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
#endif
	}

	/*
	 * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before
	 * the waiter checks the bit. Pairs with implicit barrier in
	 * prepare_to_wait() in ext4_fc_del().
	 *
	 * NOTE(review): this barrier is issued after the wake_up_bit() calls
	 * in the loop above - confirm that this is the intended ordering.
	 */
	smp_mb();
	ext4_fc_unlock(sb, alloc_ctx);

	/*
	 * If we encountered error in Step 2, return it now after clearing
	 * EXT4_STATE_FC_FLUSHING_DATA bit.
	 */
	if (ret)
		return ret;


	/* Step 4: Mark all inodes as being committed. */
	jbd2_journal_lock_updates(journal);
	/*
	 * The journal is now locked. No more handles can start and all the
	 * previous handles are now drained. We now mark the inodes on the
	 * commit queue as being committed.
	 */
	alloc_ctx = ext4_fc_lock(sb);
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&iter->vfs_inode,
				     EXT4_STATE_FC_COMMITTING);
	}
	ext4_fc_unlock(sb, alloc_ctx);
	jbd2_journal_unlock_updates(journal);

	/*
	 * Step 5: If file system device is different from journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	alloc_ctx = ext4_fc_lock(sb);
	/* Step 6: Write fast commit blocks to disk. */
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Step 6.1: Add a head tag only if this is the first fast
		 * commit in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
				     (u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	/* Step 6.2: Now write all the dentry updates. */
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret)
		goto out;

	/* Step 6.3: Now write all the changed inodes to disk. */
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		/* Only inodes marked in step 4 belong to this commit. */
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
	}
	/* Step 6.4: Finally write tail tag to conclude this fast commit. */
	ret = ext4_fc_write_tail(sb, crc);

out:
	/* Common exit: drop the fast commit lock and finish the plug. */
	ext4_fc_unlock(sb, alloc_ctx);
	blk_finish_plug(&plug);
	return ret;
}
1167
1168static void ext4_fc_update_stats(struct super_block *sb, int status,
1169 u64 commit_time, int nblks, tid_t commit_tid)
1170{
1171 struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1172
1173 ext4_debug("Fast commit ended with status = %d for tid %u",
1174 status, commit_tid);
1175 if (status == EXT4_FC_STATUS_OK) {
1176 stats->fc_num_commits++;
1177 stats->fc_numblks += nblks;
1178 if (likely(stats->s_fc_avg_commit_time))
1179 stats->s_fc_avg_commit_time =
1180 (commit_time +
1181 stats->s_fc_avg_commit_time * 3) / 4;
1182 else
1183 stats->s_fc_avg_commit_time = commit_time;
1184 } else if (status == EXT4_FC_STATUS_FAILED ||
1185 status == EXT4_FC_STATUS_INELIGIBLE) {
1186 if (status == EXT4_FC_STATUS_FAILED)
1187 stats->fc_failed_commits++;
1188 stats->fc_ineligible_commits++;
1189 } else {
1190 stats->fc_skipped_commits++;
1191 }
1192 trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1193}
1194
/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 *
 * When the JOURNAL_FAST_COMMIT mount option is not set, or the fast commit
 * cannot be started, the result of jbd2_complete_transaction() is returned.
 * When another commit is already in progress and no restart is needed, the
 * attempt is counted as skipped and 0 is returned.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;
	int old_ioprio, journal_ioprio;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

	trace_ext4_fc_commit_start(sb, commit_tid);

	start_time = ktime_get();
	/* Remembered so the caller's I/O priority can be restored later. */
	old_ioprio = get_current_ioprio();

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
		    tid_gt(commit_tid, journal->j_commit_sequence))
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
				     commit_tid);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
				     commit_tid);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
	 * if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

	/*
	 * Now that we know that this thread is going to do a fast commit,
	 * elevate the priority to match that of the journal thread.
	 */
	if (journal->j_task->io_context)
		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
	else
		journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
	set_task_ioprio(current, journal_ioprio);
	/* Blocks (rounded up) already consumed by earlier fast commits. */
	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	/* Number of fast commit blocks written by this commit. */
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	set_task_ioprio(current, old_ioprio);
	/*
	 * Measure how long this fast commit took. ext4_fc_update_stats()
	 * blends it into the running average, weighting the previous
	 * average three times as heavily as this sample so that we don't
	 * react too strongly to vast changes in the commit time.
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
	return ret;

fallback:
	/* Restore the I/O priority and end the fast commit via fallback. */
	set_task_ioprio(current, old_ioprio);
	ret = jbd2_fc_end_commit_fallback(journal);
	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
	return ret;
}
1286
/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 *
 * Empties the main inode and dentry queues, wakes up waiters on
 * EXT4_STATE_FC_COMMITTING, splices the staging queues into the main
 * queues and, when @tid has caught up with s_fc_ineligible_tid, clears the
 * ineligible mount flag.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct ext4_fc_dentry_update *fc_dentry;
	int alloc_ctx;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	trace_ext4_fc_cleanup(journal, full, tid);
	jbd2_fc_release_bufs(journal);

	alloc_ctx = ext4_fc_lock(sb);
	/* Drain the main inode queue one entry at a time. */
	while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
		ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
				      struct ext4_inode_info,
				      i_fc_list);
		list_del_init(&ei->i_fc_list);
		ext4_clear_inode_state(&ei->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		if (tid_geq(tid, ei->i_sync_tid)) {
			ext4_fc_reset_inode(&ei->vfs_inode);
		} else if (full) {
			/*
			 * We are called after a full commit, inode has been
			 * modified while the commit was running. Re-enqueue
			 * the inode into STAGING, which will then be splice
			 * back into MAIN. This cannot happen during
			 * fastcommit because the journal is locked all the
			 * time in that case (and tid doesn't increase so
			 * tid check above isn't reliable).
			 */
			list_add_tail(&ei->i_fc_list,
				      &sbi->s_fc_q[FC_Q_STAGING]);
		}
		/*
		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
		 * visible before we send the wakeup. Pairs with implicit
		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
		 */
		smp_mb();
		/* The word holding the state bits differs by word size. */
#if (BITS_PER_LONG < 64)
		wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	/* Free every dentry update left on the main dentry queue. */
	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		list_del_init(&fc_dentry->fcd_dilist);

		release_dentry_name_snapshot(&fc_dentry->fcd_name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
	}

	/* Updates staged in the meantime become the next commit's input. */
	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
			 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
			 &sbi->s_fc_q[FC_Q_MAIN]);

	if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
		sbi->s_fc_ineligible_tid = 0;
		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	}

	/* Only a full commit resets the fast commit byte counter. */
	if (full)
		sbi->s_fc_bytes = 0;
	ext4_fc_unlock(sb, alloc_ctx);
	trace_ext4_fc_stats(sb);
}
1367
1368/* Ext4 Replay Path Routines */
1369
/*
 * Decoded form of a dentry tag (CREAT / LINK / UNLINK) used by the dentry
 * replay routines; filled in by tl_to_darg().
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of the name at @dname */
	int ino;		/* inode number the entry refers to */
	int inode_len;
	char *dname;		/* entry name, points into the tag's value */
};
1375
/*
 * Same as struct ext4_fc_tl, but uses native endianness fields.
 * Filled in from the on-disk form by ext4_fc_get_tl().
 */
struct ext4_fc_tl_mem {
	u16 fc_tag;	/* tag identifying the kind of record */
	u16 fc_len;	/* length of the value that follows the tag header */
};
1381
1382static inline void tl_to_darg(struct dentry_info_args *darg,
1383 struct ext4_fc_tl_mem *tl, u8 *val)
1384{
1385 struct ext4_fc_dentry_info fcd;
1386
1387 memcpy(&fcd, val, sizeof(fcd));
1388
1389 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1390 darg->ino = le32_to_cpu(fcd.fc_ino);
1391 darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1392 darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
1393}
1394
1395static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
1396{
1397 struct ext4_fc_tl tl_disk;
1398
1399 memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
1400 tl->fc_len = le16_to_cpu(tl_disk.fc_len);
1401 tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
1402}
1403
1404/* Unlink replay function */
1405static int ext4_fc_replay_unlink(struct super_block *sb,
1406 struct ext4_fc_tl_mem *tl, u8 *val)
1407{
1408 struct inode *inode, *old_parent;
1409 struct qstr entry;
1410 struct dentry_info_args darg;
1411 int ret = 0;
1412
1413 tl_to_darg(&darg, tl, val);
1414
1415 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1416 darg.parent_ino, darg.dname_len);
1417
1418 entry.name = darg.dname;
1419 entry.len = darg.dname_len;
1420 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1421
1422 if (IS_ERR(inode)) {
1423 ext4_debug("Inode %d not found", darg.ino);
1424 return 0;
1425 }
1426
1427 old_parent = ext4_iget(sb, darg.parent_ino,
1428 EXT4_IGET_NORMAL);
1429 if (IS_ERR(old_parent)) {
1430 ext4_debug("Dir with inode %d not found", darg.parent_ino);
1431 iput(inode);
1432 return 0;
1433 }
1434
1435 ret = __ext4_unlink(old_parent, &entry, inode, NULL);
1436 /* -ENOENT ok coz it might not exist anymore. */
1437 if (ret == -ENOENT)
1438 ret = 0;
1439 iput(old_parent);
1440 iput(inode);
1441 return ret;
1442}
1443
1444static int ext4_fc_replay_link_internal(struct super_block *sb,
1445 struct dentry_info_args *darg,
1446 struct inode *inode)
1447{
1448 struct inode *dir = NULL;
1449 struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1450 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1451 int ret = 0;
1452
1453 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1454 if (IS_ERR(dir)) {
1455 ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1456 dir = NULL;
1457 goto out;
1458 }
1459
1460 dentry_dir = d_obtain_alias(dir);
1461 if (IS_ERR(dentry_dir)) {
1462 ext4_debug("Failed to obtain dentry");
1463 dentry_dir = NULL;
1464 goto out;
1465 }
1466
1467 dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1468 if (!dentry_inode) {
1469 ext4_debug("Inode dentry not created.");
1470 ret = -ENOMEM;
1471 goto out;
1472 }
1473
1474 ret = __ext4_link(dir, inode, dentry_inode);
1475 /*
1476 * It's possible that link already existed since data blocks
1477 * for the dir in question got persisted before we crashed OR
1478 * we replayed this tag and crashed before the entire replay
1479 * could complete.
1480 */
1481 if (ret && ret != -EEXIST) {
1482 ext4_debug("Failed to link\n");
1483 goto out;
1484 }
1485
1486 ret = 0;
1487out:
1488 if (dentry_dir) {
1489 d_drop(dentry_dir);
1490 dput(dentry_dir);
1491 } else if (dir) {
1492 iput(dir);
1493 }
1494 if (dentry_inode) {
1495 d_drop(dentry_inode);
1496 dput(dentry_inode);
1497 }
1498
1499 return ret;
1500}
1501
1502/* Link replay function */
1503static int ext4_fc_replay_link(struct super_block *sb,
1504 struct ext4_fc_tl_mem *tl, u8 *val)
1505{
1506 struct inode *inode;
1507 struct dentry_info_args darg;
1508 int ret = 0;
1509
1510 tl_to_darg(&darg, tl, val);
1511 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1512 darg.parent_ino, darg.dname_len);
1513
1514 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1515 if (IS_ERR(inode)) {
1516 ext4_debug("Inode not found.");
1517 return 0;
1518 }
1519
1520 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1521 iput(inode);
1522 return ret;
1523}
1524
1525/*
1526 * Record all the modified inodes during replay. We use this later to setup
1527 * block bitmaps correctly.
1528 */
1529static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1530{
1531 struct ext4_fc_replay_state *state;
1532 int i;
1533
1534 state = &EXT4_SB(sb)->s_fc_replay_state;
1535 for (i = 0; i < state->fc_modified_inodes_used; i++)
1536 if (state->fc_modified_inodes[i] == ino)
1537 return 0;
1538 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1539 int *fc_modified_inodes;
1540
1541 fc_modified_inodes = krealloc(state->fc_modified_inodes,
1542 sizeof(int) * (state->fc_modified_inodes_size +
1543 EXT4_FC_REPLAY_REALLOC_INCREMENT),
1544 GFP_KERNEL);
1545 if (!fc_modified_inodes)
1546 return -ENOMEM;
1547 state->fc_modified_inodes = fc_modified_inodes;
1548 state->fc_modified_inodes_size +=
1549 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1550 }
1551 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1552 return 0;
1553}
1554
1555/*
1556 * Inode replay function
1557 */
1558static int ext4_fc_replay_inode(struct super_block *sb,
1559 struct ext4_fc_tl_mem *tl, u8 *val)
1560{
1561 struct ext4_fc_inode fc_inode;
1562 struct ext4_inode *raw_inode;
1563 struct ext4_inode *raw_fc_inode;
1564 struct inode *inode = NULL;
1565 struct ext4_iloc iloc;
1566 int inode_len, ino, ret, tag = tl->fc_tag;
1567 struct ext4_extent_header *eh;
1568 size_t off_gen = offsetof(struct ext4_inode, i_generation);
1569
1570 memcpy(&fc_inode, val, sizeof(fc_inode));
1571
1572 ino = le32_to_cpu(fc_inode.fc_ino);
1573 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1574
1575 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1576 if (!IS_ERR(inode)) {
1577 ext4_ext_clear_bb(inode);
1578 iput(inode);
1579 }
1580 inode = NULL;
1581
1582 ret = ext4_fc_record_modified_inode(sb, ino);
1583 if (ret)
1584 goto out;
1585
1586 raw_fc_inode = (struct ext4_inode *)
1587 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1588 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1589 if (ret)
1590 goto out;
1591
1592 inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
1593 raw_inode = ext4_raw_inode(&iloc);
1594
1595 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1596 memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
1597 inode_len - off_gen);
1598 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1599 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1600 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1601 memset(eh, 0, sizeof(*eh));
1602 eh->eh_magic = EXT4_EXT_MAGIC;
1603 eh->eh_max = cpu_to_le16(
1604 (sizeof(raw_inode->i_block) -
1605 sizeof(struct ext4_extent_header))
1606 / sizeof(struct ext4_extent));
1607 }
1608 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1609 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1610 sizeof(raw_inode->i_block));
1611 }
1612
1613 /* Immediately update the inode on disk. */
1614 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1615 if (ret)
1616 goto out;
1617 ret = sync_dirty_buffer(iloc.bh);
1618 if (ret)
1619 goto out;
1620 ret = ext4_mark_inode_used(sb, ino);
1621 if (ret)
1622 goto out;
1623
1624 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1625 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1626 if (IS_ERR(inode)) {
1627 ext4_debug("Inode not found.");
1628 return -EFSCORRUPTED;
1629 }
1630
1631 /*
1632 * Our allocator could have made different decisions than before
1633 * crashing. This should be fixed but until then, we calculate
1634 * the number of blocks the inode.
1635 */
1636 if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1637 ext4_ext_replay_set_iblocks(inode);
1638
1639 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1640 ext4_reset_inode_seed(inode);
1641
1642 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1643 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1644 sync_dirty_buffer(iloc.bh);
1645 brelse(iloc.bh);
1646out:
1647 iput(inode);
1648 if (!ret)
1649 blkdev_issue_flush(sb->s_bdev);
1650
1651 return 0;
1652}
1653
1654/*
1655 * Dentry create replay function.
1656 *
1657 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1658 * inode for which we are trying to create a dentry here, should already have
1659 * been replayed before we start here.
1660 */
1661static int ext4_fc_replay_create(struct super_block *sb,
1662 struct ext4_fc_tl_mem *tl, u8 *val)
1663{
1664 int ret = 0;
1665 struct inode *inode = NULL;
1666 struct inode *dir = NULL;
1667 struct dentry_info_args darg;
1668
1669 tl_to_darg(&darg, tl, val);
1670
1671 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1672 darg.parent_ino, darg.dname_len);
1673
1674 /* This takes care of update group descriptor and other metadata */
1675 ret = ext4_mark_inode_used(sb, darg.ino);
1676 if (ret)
1677 goto out;
1678
1679 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1680 if (IS_ERR(inode)) {
1681 ext4_debug("inode %d not found.", darg.ino);
1682 inode = NULL;
1683 ret = -EINVAL;
1684 goto out;
1685 }
1686
1687 if (S_ISDIR(inode->i_mode)) {
1688 /*
1689 * If we are creating a directory, we need to make sure that the
1690 * dot and dot dot dirents are setup properly.
1691 */
1692 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1693 if (IS_ERR(dir)) {
1694 ext4_debug("Dir %d not found.", darg.ino);
1695 goto out;
1696 }
1697 ret = ext4_init_new_dir(NULL, dir, inode);
1698 iput(dir);
1699 if (ret) {
1700 ret = 0;
1701 goto out;
1702 }
1703 }
1704 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1705 if (ret)
1706 goto out;
1707 set_nlink(inode, 1);
1708 ext4_mark_inode_dirty(NULL, inode);
1709out:
1710 iput(inode);
1711 return ret;
1712}
1713
1714/*
1715 * Record physical disk regions which are in use as per fast commit area,
1716 * and used by inodes during replay phase. Our simple replay phase
1717 * allocator excludes these regions from allocation.
1718 */
1719int ext4_fc_record_regions(struct super_block *sb, int ino,
1720 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1721{
1722 struct ext4_fc_replay_state *state;
1723 struct ext4_fc_alloc_region *region;
1724
1725 state = &EXT4_SB(sb)->s_fc_replay_state;
1726 /*
1727 * during replay phase, the fc_regions_valid may not same as
1728 * fc_regions_used, update it when do new additions.
1729 */
1730 if (replay && state->fc_regions_used != state->fc_regions_valid)
1731 state->fc_regions_used = state->fc_regions_valid;
1732 if (state->fc_regions_used == state->fc_regions_size) {
1733 struct ext4_fc_alloc_region *fc_regions;
1734
1735 fc_regions = krealloc(state->fc_regions,
1736 sizeof(struct ext4_fc_alloc_region) *
1737 (state->fc_regions_size +
1738 EXT4_FC_REPLAY_REALLOC_INCREMENT),
1739 GFP_KERNEL);
1740 if (!fc_regions)
1741 return -ENOMEM;
1742 state->fc_regions_size +=
1743 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1744 state->fc_regions = fc_regions;
1745 }
1746 region = &state->fc_regions[state->fc_regions_used++];
1747 region->ino = ino;
1748 region->lblk = lblk;
1749 region->pblk = pblk;
1750 region->len = len;
1751
1752 if (replay)
1753 state->fc_regions_valid++;
1754
1755 return 0;
1756}
1757
/*
 * Replay add range tag
 *
 * Walks the logical range described by the extent carried in the tag. Parts
 * that are not mapped get a new extent inserted; parts mapped to different
 * physical blocks are re-pointed and the old blocks marked free; parts that
 * already map to the right blocks get their written/unwritten state updated.
 *
 * Always returns 0: a missing inode is skipped and errors hit while
 * replaying only stop the processing of this tag.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, path, 0);
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			/* Physical block at the same offset into the range. */
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			path = ext4_ext_insert_extent(NULL, inode,
						      path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			if (IS_ERR(path))
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
			goto next;
		}

		/* Range is mapped and needs a state change */
		ext4_debug("Converting from %ld to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		/* Advance past the part handled in this iteration. */
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
out:
	ext4_free_ext_path(path);
	iput(inode);
	return 0;
}
1877
1878/* Replay DEL_RANGE tag */
1879static int
1880ext4_fc_replay_del_range(struct super_block *sb,
1881 struct ext4_fc_tl_mem *tl, u8 *val)
1882{
1883 struct inode *inode;
1884 struct ext4_fc_del_range lrange;
1885 struct ext4_map_blocks map;
1886 ext4_lblk_t cur, remaining;
1887 int ret;
1888
1889 memcpy(&lrange, val, sizeof(lrange));
1890 cur = le32_to_cpu(lrange.fc_lblk);
1891 remaining = le32_to_cpu(lrange.fc_len);
1892
1893 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1894 le32_to_cpu(lrange.fc_ino), cur, remaining);
1895
1896 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1897 if (IS_ERR(inode)) {
1898 ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1899 return 0;
1900 }
1901
1902 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1903 if (ret)
1904 goto out;
1905
1906 ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1907 inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1908 le32_to_cpu(lrange.fc_len));
1909 while (remaining > 0) {
1910 map.m_lblk = cur;
1911 map.m_len = remaining;
1912
1913 ret = ext4_map_blocks(NULL, inode, &map, 0);
1914 if (ret < 0)
1915 goto out;
1916 if (ret > 0) {
1917 remaining -= ret;
1918 cur += ret;
1919 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1920 } else {
1921 remaining -= map.m_len;
1922 cur += map.m_len;
1923 }
1924 }
1925
1926 down_write(&EXT4_I(inode)->i_data_sem);
1927 ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1928 le32_to_cpu(lrange.fc_lblk) +
1929 le32_to_cpu(lrange.fc_len) - 1);
1930 up_write(&EXT4_I(inode)->i_data_sem);
1931 if (ret)
1932 goto out;
1933 ext4_ext_replay_shrink_inode(inode,
1934 i_size_read(inode) >> sb->s_blocksize_bits);
1935 ext4_mark_inode_dirty(NULL, inode);
1936out:
1937 iput(inode);
1938 return 0;
1939}
1940
/*
 * Walk every inode recorded as modified during replay and mark the blocks
 * it maps, together with the extent tree blocks found on the lookup path,
 * as allocated. Inodes that cannot be loaded or that carry inline data are
 * skipped.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
			EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			ext4_debug("Inode %d not found.",
				state->fc_modified_inodes[i]);
			continue;
		}
		/* Scan the inode's whole logical block range. */
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				/* path is handed back in and reused here. */
				path = ext4_find_extent(inode, map.m_lblk, path, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, true);
				} else {
					/* Do not carry an error pointer over. */
					path = NULL;
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, true);
			} else {
				/* Always make progress, even for m_len == 0. */
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}

	ext4_free_ext_path(path);
}
1994
1995/*
1996 * Check if block is in excluded regions for block allocation. The simple
1997 * allocator that runs during replay phase is calls this function to see
1998 * if it is okay to use a block.
1999 */
2000bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
2001{
2002 int i;
2003 struct ext4_fc_replay_state *state;
2004
2005 state = &EXT4_SB(sb)->s_fc_replay_state;
2006 for (i = 0; i < state->fc_regions_valid; i++) {
2007 if (state->fc_regions[i].ino == 0 ||
2008 state->fc_regions[i].len == 0)
2009 continue;
2010 if (in_range(blk, state->fc_regions[i].pblk,
2011 state->fc_regions[i].len))
2012 return true;
2013 }
2014 return false;
2015}
2016
2017/* Cleanup function called after replay */
2018void ext4_fc_replay_cleanup(struct super_block *sb)
2019{
2020 struct ext4_sb_info *sbi = EXT4_SB(sb);
2021
2022 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
2023 kfree(sbi->s_fc_replay_state.fc_regions);
2024 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
2025}
2026
2027static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
2028 int tag, int len)
2029{
2030 switch (tag) {
2031 case EXT4_FC_TAG_ADD_RANGE:
2032 return len == sizeof(struct ext4_fc_add_range);
2033 case EXT4_FC_TAG_DEL_RANGE:
2034 return len == sizeof(struct ext4_fc_del_range);
2035 case EXT4_FC_TAG_CREAT:
2036 case EXT4_FC_TAG_LINK:
2037 case EXT4_FC_TAG_UNLINK:
2038 len -= sizeof(struct ext4_fc_dentry_info);
2039 return len >= 1 && len <= EXT4_NAME_LEN;
2040 case EXT4_FC_TAG_INODE:
2041 len -= sizeof(struct ext4_fc_inode);
2042 return len >= EXT4_GOOD_OLD_INODE_SIZE &&
2043 len <= sbi->s_inode_size;
2044 case EXT4_FC_TAG_PAD:
2045 return true; /* padding can have any length */
2046 case EXT4_FC_TAG_TAIL:
2047 return len >= sizeof(struct ext4_fc_tail);
2048 case EXT4_FC_TAG_HEAD:
2049 return len == sizeof(struct ext4_fc_head);
2050 }
2051 return false;
2052}
2053
2054/*
2055 * Recovery Scan phase handler
2056 *
2057 * This function is called during the scan phase and is responsible
2058 * for doing following things:
2059 * - Make sure the fast commit area has valid tags for replay
2060 * - Count number of tags that need to be replayed by the replay handler
2061 * - Verify CRC
2062 * - Create a list of excluded blocks for allocation during replay phase
2063 *
2064 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
2065 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
2066 * to indicate that scan has finished and JBD2 can now start replay phase.
2067 * It returns a negative error to indicate that there was an error. At the end
2068 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
2069 * to indicate the number of tags that need to replayed during the replay phase.
2070 */
2071static int ext4_fc_replay_scan(journal_t *journal,
2072 struct buffer_head *bh, int off,
2073 tid_t expected_tid)
2074{
2075 struct super_block *sb = journal->j_private;
2076 struct ext4_sb_info *sbi = EXT4_SB(sb);
2077 struct ext4_fc_replay_state *state;
2078 int ret = JBD2_FC_REPLAY_CONTINUE;
2079 struct ext4_fc_add_range ext;
2080 struct ext4_fc_tl_mem tl;
2081 struct ext4_fc_tail tail;
2082 __u8 *start, *end, *cur, *val;
2083 struct ext4_fc_head head;
2084 struct ext4_extent *ex;
2085
2086 state = &sbi->s_fc_replay_state;
2087
2088 start = (u8 *)bh->b_data;
2089 end = start + journal->j_blocksize;
2090
2091 if (state->fc_replay_expected_off == 0) {
2092 state->fc_cur_tag = 0;
2093 state->fc_replay_num_tags = 0;
2094 state->fc_crc = 0;
2095 state->fc_regions = NULL;
2096 state->fc_regions_valid = state->fc_regions_used =
2097 state->fc_regions_size = 0;
2098 /* Check if we can stop early */
2099 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2100 != EXT4_FC_TAG_HEAD)
2101 return 0;
2102 }
2103
2104 if (off != state->fc_replay_expected_off) {
2105 ret = -EFSCORRUPTED;
2106 goto out_err;
2107 }
2108
2109 state->fc_replay_expected_off++;
2110 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2111 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2112 ext4_fc_get_tl(&tl, cur);
2113 val = cur + EXT4_FC_TAG_BASE_LEN;
2114 if (tl.fc_len > end - val ||
2115 !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
2116 ret = state->fc_replay_num_tags ?
2117 JBD2_FC_REPLAY_STOP : -ECANCELED;
2118 goto out_err;
2119 }
2120 ext4_debug("Scan phase, tag:%s, blk %lld\n",
2121 tag2str(tl.fc_tag), bh->b_blocknr);
2122 switch (tl.fc_tag) {
2123 case EXT4_FC_TAG_ADD_RANGE:
2124 memcpy(&ext, val, sizeof(ext));
2125 ex = (struct ext4_extent *)&ext.fc_ex;
2126 ret = ext4_fc_record_regions(sb,
2127 le32_to_cpu(ext.fc_ino),
2128 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2129 ext4_ext_get_actual_len(ex), 0);
2130 if (ret < 0)
2131 break;
2132 ret = JBD2_FC_REPLAY_CONTINUE;
2133 fallthrough;
2134 case EXT4_FC_TAG_DEL_RANGE:
2135 case EXT4_FC_TAG_LINK:
2136 case EXT4_FC_TAG_UNLINK:
2137 case EXT4_FC_TAG_CREAT:
2138 case EXT4_FC_TAG_INODE:
2139 case EXT4_FC_TAG_PAD:
2140 state->fc_cur_tag++;
2141 state->fc_crc = ext4_chksum(state->fc_crc, cur,
2142 EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2143 break;
2144 case EXT4_FC_TAG_TAIL:
2145 state->fc_cur_tag++;
2146 memcpy(&tail, val, sizeof(tail));
2147 state->fc_crc = ext4_chksum(state->fc_crc, cur,
2148 EXT4_FC_TAG_BASE_LEN +
2149 offsetof(struct ext4_fc_tail,
2150 fc_crc));
2151 if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2152 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2153 state->fc_replay_num_tags = state->fc_cur_tag;
2154 state->fc_regions_valid =
2155 state->fc_regions_used;
2156 } else {
2157 ret = state->fc_replay_num_tags ?
2158 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2159 }
2160 state->fc_crc = 0;
2161 break;
2162 case EXT4_FC_TAG_HEAD:
2163 memcpy(&head, val, sizeof(head));
2164 if (le32_to_cpu(head.fc_features) &
2165 ~EXT4_FC_SUPPORTED_FEATURES) {
2166 ret = -EOPNOTSUPP;
2167 break;
2168 }
2169 if (le32_to_cpu(head.fc_tid) != expected_tid) {
2170 ret = JBD2_FC_REPLAY_STOP;
2171 break;
2172 }
2173 state->fc_cur_tag++;
2174 state->fc_crc = ext4_chksum(state->fc_crc, cur,
2175 EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2176 break;
2177 default:
2178 ret = state->fc_replay_num_tags ?
2179 JBD2_FC_REPLAY_STOP : -ECANCELED;
2180 }
2181 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2182 break;
2183 }
2184
2185out_err:
2186 trace_ext4_fc_replay_scan(sb, ret, off);
2187 return ret;
2188}
2189
2190/*
2191 * Main recovery path entry point.
2192 * The meaning of return codes is similar as above.
2193 */
2194static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2195 enum passtype pass, int off, tid_t expected_tid)
2196{
2197 struct super_block *sb = journal->j_private;
2198 struct ext4_sb_info *sbi = EXT4_SB(sb);
2199 struct ext4_fc_tl_mem tl;
2200 __u8 *start, *end, *cur, *val;
2201 int ret = JBD2_FC_REPLAY_CONTINUE;
2202 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2203 struct ext4_fc_tail tail;
2204
2205 if (pass == PASS_SCAN) {
2206 state->fc_current_pass = PASS_SCAN;
2207 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2208 }
2209
2210 if (state->fc_current_pass != pass) {
2211 state->fc_current_pass = pass;
2212 sbi->s_mount_state |= EXT4_FC_REPLAY;
2213 }
2214 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2215 ext4_debug("Replay stops\n");
2216 ext4_fc_set_bitmaps_and_counters(sb);
2217 return 0;
2218 }
2219
2220#ifdef CONFIG_EXT4_DEBUG
2221 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2222 pr_warn("Dropping fc block %d because max_replay set\n", off);
2223 return JBD2_FC_REPLAY_STOP;
2224 }
2225#endif
2226
2227 start = (u8 *)bh->b_data;
2228 end = start + journal->j_blocksize;
2229
2230 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2231 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2232 ext4_fc_get_tl(&tl, cur);
2233 val = cur + EXT4_FC_TAG_BASE_LEN;
2234
2235 if (state->fc_replay_num_tags == 0) {
2236 ret = JBD2_FC_REPLAY_STOP;
2237 ext4_fc_set_bitmaps_and_counters(sb);
2238 break;
2239 }
2240
2241 ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
2242 state->fc_replay_num_tags--;
2243 switch (tl.fc_tag) {
2244 case EXT4_FC_TAG_LINK:
2245 ret = ext4_fc_replay_link(sb, &tl, val);
2246 break;
2247 case EXT4_FC_TAG_UNLINK:
2248 ret = ext4_fc_replay_unlink(sb, &tl, val);
2249 break;
2250 case EXT4_FC_TAG_ADD_RANGE:
2251 ret = ext4_fc_replay_add_range(sb, &tl, val);
2252 break;
2253 case EXT4_FC_TAG_CREAT:
2254 ret = ext4_fc_replay_create(sb, &tl, val);
2255 break;
2256 case EXT4_FC_TAG_DEL_RANGE:
2257 ret = ext4_fc_replay_del_range(sb, &tl, val);
2258 break;
2259 case EXT4_FC_TAG_INODE:
2260 ret = ext4_fc_replay_inode(sb, &tl, val);
2261 break;
2262 case EXT4_FC_TAG_PAD:
2263 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2264 tl.fc_len, 0);
2265 break;
2266 case EXT4_FC_TAG_TAIL:
2267 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
2268 0, tl.fc_len, 0);
2269 memcpy(&tail, val, sizeof(tail));
2270 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2271 break;
2272 case EXT4_FC_TAG_HEAD:
2273 break;
2274 default:
2275 trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
2276 ret = -ECANCELED;
2277 break;
2278 }
2279 if (ret < 0)
2280 break;
2281 ret = JBD2_FC_REPLAY_CONTINUE;
2282 }
2283 return ret;
2284}
2285
2286void ext4_fc_init(struct super_block *sb, journal_t *journal)
2287{
2288 /*
2289 * We set replay callback even if fast commit disabled because we may
2290 * could still have fast commit blocks that need to be replayed even if
2291 * fast commit has now been turned off.
2292 */
2293 journal->j_fc_replay_callback = ext4_fc_replay;
2294 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2295 return;
2296 journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2297}
2298
2299static const char * const fc_ineligible_reasons[] = {
2300 [EXT4_FC_REASON_XATTR] = "Extended attributes changed",
2301 [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
2302 [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
2303 [EXT4_FC_REASON_NOMEM] = "Insufficient memory",
2304 [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
2305 [EXT4_FC_REASON_RESIZE] = "Resize",
2306 [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
2307 [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
2308 [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
2309 [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
2310 [EXT4_FC_REASON_MIGRATE] = "Inode format migration",
2311 [EXT4_FC_REASON_VERITY] = "fs-verity enable",
2312 [EXT4_FC_REASON_MOVE_EXT] = "Move extents",
2313};
2314
2315int ext4_fc_info_show(struct seq_file *seq, void *v)
2316{
2317 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2318 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2319 int i;
2320
2321 if (v != SEQ_START_TOKEN)
2322 return 0;
2323
2324 seq_printf(seq,
2325 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2326 stats->fc_num_commits, stats->fc_ineligible_commits,
2327 stats->fc_numblks,
2328 div_u64(stats->s_fc_avg_commit_time, 1000));
2329 seq_puts(seq, "Ineligible reasons:\n");
2330 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2331 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2332 stats->fc_ineligible_reason_count[i]);
2333
2334 return 0;
2335}
2336
2337int __init ext4_fc_init_dentry_cache(void)
2338{
2339 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2340 SLAB_RECLAIM_ACCOUNT);
2341
2342 if (ext4_fc_dentry_cachep == NULL)
2343 return -ENOMEM;
2344
2345 return 0;
2346}
2347
2348void ext4_fc_destroy_dentry_cache(void)
2349{
2350 kmem_cache_destroy(ext4_fc_dentry_cachep);
2351}