// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
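	/*
	 * For a metadata write, this temporary bh shadows the original
	 * metadata buffer.  Once the journal IO completes, drop BH_Shadow
	 * on the original so that anyone waiting for the shadow state to
	 * clear (e.g. a writer in do_get_write_access()) can continue.
	 */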
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers. These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list. Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under j_list_lock. The caller provided us with a ref against the
 * buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct folio *folio;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	folio = bh->b_folio;
	if (folio->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!folio_trylock(folio))
		goto nope;

	folio_get(folio);
	__brelse(bh);
	try_to_free_buffers(folio);
	folio_unlock(folio);
	folio_put(folio);
	return;

nope:
	__brelse(bh);
}

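/*
 * Compute the checksum of a commit block and store it in the commit header.
 * The checksum fields are zeroed first so that the block is checksummed in
 * exactly the layout that will be written to disk.
 */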
static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct commit_header *h;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	h = (struct commit_header *)(bh->b_data);
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}

/*
 * Done it all: now submit the commit record. We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	struct timespec64 now;
	blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS;

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_COMMIT_BLOCK);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	ktime_get_coarse_real_ts64(&now);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (jbd2_has_feature_checksum(journal)) {
		tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

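	/*
	 * With barriers enabled and no async commit, the commit block must
	 * not reach stable storage before the blocks it commits, so issue
	 * it with a cache flush (REQ_PREFLUSH) and forced unit access
	 * (REQ_FUA).
	 */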
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_has_feature_async_commit(journal))
		write_flags |= REQ_PREFLUSH | REQ_FUA;

	submit_bh(write_flags, bh);
	*cbh = bh;
	return 0;
}

/*
 * This function, together with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);	/* One for getblk() */

	return ret;
}

/* Send all the data buffers related to an inode */
int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	unsigned long flags;

	if (!jinode)
		return 0;

	flags = READ_ONCE(jinode->i_flags);
	if (!(flags & JI_WRITE_DATA))
		return 0;

	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
	return journal->j_submit_inode_data_buffers(jinode);

}
EXPORT_SYMBOL(jbd2_submit_inode_data);

int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	struct address_space *mapping;
	struct inode *inode;
	unsigned long flags;
	loff_t start_byte, end_byte;

	if (!jinode)
		return 0;

	flags = READ_ONCE(jinode->i_flags);
	if (!(flags & JI_WAIT_DATA))
		return 0;

	inode = jinode->i_vfs_inode;
	if (!inode)
		return 0;

	mapping = inode->i_mapping;
	if (!mapping)
		return 0;

	if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte))
		return 0;
	return filemap_fdatawait_range_keep_errors(
		mapping, start_byte, end_byte);
}
EXPORT_SYMBOL(jbd2_wait_inode_data);

/*
 * Submit all the data buffers of inode associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 * operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WRITE_DATA))
			continue;
		WRITE_ONCE(jinode->i_flags,
			   jinode->i_flags | JI_COMMIT_RUNNING);
		spin_unlock(&journal->j_list_lock);
		/* submit the inode data buffers. */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		if (journal->j_submit_inode_data_buffers) {
			err = journal->j_submit_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		WRITE_ONCE(jinode->i_flags,
			   jinode->i_flags & ~JI_COMMIT_RUNNING);
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

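/*
 * Wait for writeback of the jinode's dirty byte range to complete.  Errors
 * are deliberately left pending on the mapping (rather than cleared) so the
 * filesystem can still see and report them later.
 */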
int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
	loff_t start_byte, end_byte;

	if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte))
		return 0;

	return filemap_fdatawait_range_keep_errors(mapping,
						   start_byte, end_byte);
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 *
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WAIT_DATA))
			continue;
		WRITE_ONCE(jinode->i_flags, jinode->i_flags | JI_COMMIT_RUNNING);
		spin_unlock(&journal->j_list_lock);
		/* wait for the inode data buffers writeout. */
		if (journal->j_finish_inode_data_buffers) {
			err = journal->j_finish_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		cond_resched();
		spin_lock(&journal->j_list_lock);
		WRITE_ONCE(jinode->i_flags, jinode->i_flags & ~JI_COMMIT_RUNNING);
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				 &jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
			WRITE_ONCE(jinode->i_dirty_start_page, 0);
			WRITE_ONCE(jinode->i_dirty_end_page, 0);
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

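/*
 * Fold one buffer's contents into the running crc32 that backs the
 * JBD2_FEATURE_COMPAT_CHECKSUM (v1) commit-block checksum.
 */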
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	char *addr;
	__u32 checksum;

	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
	checksum = crc32_be(crc32_sum, addr, bh->b_size);
	kunmap_local(addr);

	return checksum;
}

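/*
 * Record the on-disk location of a journalled buffer in its block tag.
 * The low 32 bits always go in t_blocknr; with the 64bit feature the high
 * bits go in t_blocknr_high.  The shift is written as "(block >> 31) >> 1"
 * rather than ">> 32", presumably to keep the expression well-defined even
 * if the block number were ever held in a 32-bit type.
 */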
static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
			    unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

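/*
 * Per-block checksum for the v2/v3 checksum features: seeded with the
 * journal's checksum seed, it covers the commit sequence number followed by
 * the buffer contents.  csum3 stores the full 32-bit value, while the older
 * csum2 tag only has room for the low 16 bits.
 */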
static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	seq = cpu_to_be32(sequence);
	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
	csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(csum32, addr, bh->b_size);
	kunmap_local(addr);

	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log. This
 * function is called by the journal thread to begin a complete commit.
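 *
 * In outline: lock down the running transaction and wait for its updates to
 * finish, flush the associated data buffers, write revoke records and the
 * metadata blocks (preceded by descriptor blocks recording where each one
 * lives in the log), wait for all of that IO, write the commit record, and
 * finally move the transaction onto the checkpoint list so its log space
 * can be reclaimed later.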
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int escape;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd2_debug(3, "super block updated\n");
		mutex_lock_io(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail, 0);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd2_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_fc_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		write_unlock(&journal->j_state_lock);
		schedule();
		write_lock(&journal->j_state_lock);
		finish_wait(&journal->j_fc_wait, &wait);
		/*
		 * TODO: by blocking fast commits here, we are increasing
		 * fsync() latency slightly. Strictly speaking, we don't need
		 * to block fast commits until the transaction enters T_FLUSH
		 * state. So an optimization is possible where we block new fast
		 * commits here and wait for existing ones to complete
		 * just before we enter T_FLUSH. That way, the existing fast
		 * commits and this full commit can proceed in parallel.
		 */
	}
	write_unlock(&journal->j_state_lock);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
		   commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	journal->j_fc_off = 0;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	// waits for any t_updates to finish
	jbd2_journal_wait_updates(journal);

	commit_transaction->t_state = T_SWITCH;

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding. These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved. This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 * We use journal->j_state_lock here to serialize processing of
	 * t_reserved_list with eviction of buffers from journal_unmap_buffer().
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			spin_lock(&jh->b_state_lock);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			spin_unlock(&jh->b_state_lock);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	write_unlock(&journal->j_state_lock);
	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists. We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP);
	spin_unlock(&journal->j_list_lock);

	jbd2_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked buffers
	 * in the next transaction, which is about to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	write_lock(&journal->j_state_lock);
	/*
	 * Reserved credits cannot be claimed anymore, free them
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up_all(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd2_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists. Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

	jbd2_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction! Now comes the tricky part: we need to write out
	 * metadata. Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT (bufs == 0);

			jbd2_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(
							commit_transaction,
							JBD2_DESCRIPTOR_BLOCK);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		escape = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
		   buffer */

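		/*
		 * "escape" is set when the buffer's first four bytes happen
		 * to match the journal magic number; the copy written to the
		 * log has had them zeroed, and the JBD2_FLAG_ESCAPE tag flag
		 * tells recovery to restore them on replay.
		 */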
		tag_flag = 0;
		if (escape)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
start_journal_io:
			if (descriptor)
				jbd2_descriptor_block_csum_set(journal,
							       descriptor);

			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];

				/*
				 * Compute checksum.
				 */
				if (jbd2_has_feature_checksum(journal)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
					  bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors %d while flushing file data on %s\n",
		       err, journal->j_devname);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < journal->j_max_transaction_buffers)
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record and update the journal tail sequence.
	 */
	if ((commit_transaction->t_need_data_flush || update_tail) &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev);

	/* Done it all: now write the commit record asynchronously. */
	if (jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	 */

	jbd2_debug(3, "JBD2: commit phase 3\n");

	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd2_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;
		__brelse(bh);	/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	stats.run.rs_blocks_logged++;
	if (jbd2_has_feature_async_commit(journal) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	WARN_ON_ONCE(
		atomic_read(&commit_transaction->t_outstanding_credits) < 0);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd2_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;
		bool drop_ref;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		spin_lock(&jh->b_state_lock);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now. If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled
		 * by a previous transaction is refiled to the BJ_Forget list
		 * of the running transaction. If the just-committed
		 * transaction contains an "add to orphan" operation, we can
		 * completely invalidate the buffer now. We are rather
		 * thorough in that, since the buffer may still be accessible
		 * when blocksize < pagesize and it is attached to the last
		 * partial page.
		 */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			struct address_space *mapping;

			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);

			/*
			 * Block device buffers need to stay mapped all the
			 * time, so it is enough to clear buffer_jbddirty and
			 * buffer_freed bits. For the file mapping buffers (i.e.
			 * journalled data) we need to unmap buffer and clear
			 * more bits. We also need to be careful about the check
			 * because the data page mapping can get cleared under
			 * our hands. Note that if mapping == NULL, we don't
			 * need to make buffer unmapped because the page is
			 * already detached from the mapping and buffers cannot
			 * get reused.
			 */
			mapping = READ_ONCE(bh->b_folio->mapping);
			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		drop_ref = __jbd2_journal_refile_buffer(jh);
		spin_unlock(&jh->b_state_lock);
		if (drop_ref)
			jbd2_journal_put_journal_head(jh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Add the transaction to the checkpoint list
	 * __journal_remove_checkpoint() can not destroy transaction
	 * under us because it is not marked as T_FINISHED yet */
	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
			commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);

	/* Done with this transaction! */

	jbd2_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid);
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;

	write_unlock(&journal->j_state_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);
	if (journal->j_fc_cleanup_callback)
		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
		   journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Check if the transaction can be dropped now that we are finished */
	if (commit_transaction->t_checkpoint_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);
	wake_up(&journal->j_fc_wait);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.ts_requested += stats.ts_requested;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
}